Skip to content

Commit

Permalink
Update garbage collector logging and fix case where pod ref does not …
Browse files Browse the repository at this point in the history
…exist
  • Loading branch information
nstogner committed Mar 8, 2024
1 parent 5381442 commit 18cf085
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
6 changes: 3 additions & 3 deletions tpu-provisioner/internal/cloud/gke.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func (g *GKE) EnsureNodePoolForPod(p *corev1.Pod, why string) error {
}

func (g *GKE) ListNodePools() ([]NodePoolRef, error) {
var names []NodePoolRef
var refs []NodePoolRef

resp, err := g.Service.Projects.Locations.Clusters.NodePools.List(g.ClusterContext.ClusterName()).Do()
if err != nil {
Expand All @@ -113,7 +113,7 @@ func (g *GKE) ListNodePools() ([]NodePoolRef, error) {
}

for _, np := range resp.NodePools {
names = append(names, NodePoolRef{
refs = append(refs, NodePoolRef{
Name: np.Name,
Error: np.Status == "ERROR",
ErrorMsg: np.StatusMessage,
Expand All @@ -124,7 +124,7 @@ func (g *GKE) ListNodePools() ([]NodePoolRef, error) {
})
}

return names, nil
return refs, nil
}

func (g *GKE) DeleteNodePoolForNode(node *corev1.Node, why string) error {
Expand Down
18 changes: 11 additions & 7 deletions tpu-provisioner/internal/controller/nodepool_garbage_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ type NodePoolGarbageCollector struct {
}

func (g *NodePoolGarbageCollector) Run(ctx context.Context) {
log := ctrllog.Log
log := ctrllog.Log.WithName("nodepool-garbage-collector")

t := time.NewTicker(g.Interval)

Expand All @@ -39,15 +39,21 @@ func (g *NodePoolGarbageCollector) Run(ctx context.Context) {
}

for _, np := range nodepools {
log = log.WithValues("nodepool", np.Name)

if !np.Error {
continue
}

if np.CreatedForPod.Name == "" || np.CreatedForPod.Namespace == "" {
log.Info("skipping garbage collection of node pool, no pod reference")
continue
}

// Check if the Pod that triggered the Node Pool creation still exists.
err := g.Get(ctx, np.CreatedForPod, &v1.Pod{})
if err == nil {
log.Info("skipping garbage collection of node pool, pod still exists",
"nodepool", np.Name,
"podName", np.CreatedForPod.Name,
"podNamespace", np.CreatedForPod.Namespace,
)
Expand All @@ -66,17 +72,15 @@ func (g *NodePoolGarbageCollector) Run(ctx context.Context) {
continue
}
if len(nodes.Items) > 0 {
log.Info("skipping garbage collection of node pool, nodes exist",
"nodepool", np.Name,
)
log.Info("skipping garbage collection of node pool, nodes exist")
continue
}

log.Info("garbage collecting node pool in error state", "nodepool", np.Name)
log.Info("garbage collecting node pool in error state")
// TODO: Lookup namespace from env with downward API.
if err := g.Provider.DeleteNodePool(np.Name, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tpu-provisioner-system"}},
"the node pool has no corresponding Nodes, the Pod that triggered its creation no longer exists, and node pool is in an error state: "+np.ErrorMsg); err != nil {
log.Error(err, "failed to garbage colelct node pool", "nodepool", np.Name)
log.Error(err, "failed to garbage colelct node pool")
continue
}
}
Expand Down

0 comments on commit 18cf085

Please sign in to comment.