From 4ea225160acfe8197e9da0dd1042361065025d40 Mon Sep 17 00:00:00 2001 From: Peter Bacsko Date: Thu, 3 Oct 2024 17:21:25 +0200 Subject: [PATCH] [YUNIKORN-2897] Health checker reports foreign allocation as orphan (#977) Closes: #977 Signed-off-by: Peter Bacsko --- pkg/scheduler/health_checker.go | 2 +- pkg/scheduler/health_checker_test.go | 9 +++++++ pkg/scheduler/objects/node.go | 13 ---------- pkg/scheduler/objects/node_test.go | 23 ------------------ .../objects/required_node_preemptor.go | 2 +- pkg/scheduler/partition.go | 2 +- pkg/scheduler/partition_test.go | 24 +++++++++---------- pkg/scheduler/tests/recovery_test.go | 4 ++-- 8 files changed, 26 insertions(+), 53 deletions(-) diff --git a/pkg/scheduler/health_checker.go b/pkg/scheduler/health_checker.go index 177d9b8d4..eaf1ea698 100644 --- a/pkg/scheduler/health_checker.go +++ b/pkg/scheduler/health_checker.go @@ -330,7 +330,7 @@ func checkAppAllocations(app *objects.Application, nodes objects.NodeCollection) func checkNodeAllocations(node *objects.Node, partitionContext *PartitionContext) []*objects.Allocation { orphanAllocationsOnNode := make([]*objects.Allocation, 0) - for _, alloc := range node.GetAllAllocations() { + for _, alloc := range node.GetYunikornAllocations() { if app := partitionContext.getApplication(alloc.GetApplicationID()); app != nil { if !app.IsAllocationAssignedToApp(alloc) { orphanAllocationsOnNode = append(orphanAllocationsOnNode, alloc) diff --git a/pkg/scheduler/health_checker_test.go b/pkg/scheduler/health_checker_test.go index 407f3f225..02402fc9f 100644 --- a/pkg/scheduler/health_checker_test.go +++ b/pkg/scheduler/health_checker_test.go @@ -225,6 +225,15 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) { healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext) assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful") assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app should not be successful") + part.removeApplication("appID") + healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext) + assert.Assert(t, healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app still fails after removing the app") + + // check that foreign allocation does not interfere with health check + falloc := newForeignAllocation("foreign-1", "node") + node.AddAllocation(falloc) + healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext) + assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "Foreign allocation was detected as orphan") } func TestGetSchedulerHealthStatusMetrics(t *testing.T) { diff --git a/pkg/scheduler/objects/node.go b/pkg/scheduler/objects/node.go index a4907111f..db51fa746 100644 --- a/pkg/scheduler/objects/node.go +++ b/pkg/scheduler/objects/node.go @@ -230,19 +230,6 @@ func (sn *Node) GetAllocation(allocationKey string) *Allocation { return sn.allocations[allocationKey] } -// Get a copy of the allocations on this node -func (sn *Node) GetAllAllocations() []*Allocation { - sn.RLock() - defer sn.RUnlock() - - arr := make([]*Allocation, 0) - for _, v := range sn.allocations { - arr = append(arr, v) - } - - return arr -} - // GetYunikornAllocations returns a copy of Yunikorn allocations on this node func (sn *Node) GetYunikornAllocations() []*Allocation { sn.RLock() diff --git a/pkg/scheduler/objects/node_test.go b/pkg/scheduler/objects/node_test.go index 90a30e620..4b38e4360 100644 --- a/pkg/scheduler/objects/node_test.go +++ b/pkg/scheduler/objects/node_test.go @@ -688,29 +688,6 @@ func TestGetAllocation(t *testing.T) { } } -func TestGetAllAllocations(t *testing.T) { - node := newNode("node-123", map[string]resources.Quantity{"first": 100, "second": 200}) - if !resources.IsZero(node.GetAllocatedResource()) { - t.Fatal("Failed to initialize resource") - } - - // nothing allocated get an empty list - allocs := node.GetAllAllocations() - if allocs == nil || len(allocs) != 0 { - t.Fatalf("allocation length should be 0 on new node") - } - alloc1 := newAllocation(appID1, nodeID1, nil) - alloc2 := newAllocation(appID1, nodeID1, nil) - - // allocate - node.AddAllocation(alloc1) - node.AddAllocation(alloc2) - assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length mismatch") - // This should not happen in real code just making sure the code does do what is expected - node.AddAllocation(alloc2) - assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length mismatch") -} - func TestSchedulingState(t *testing.T) { node := newNode("node-123", nil) if !node.IsSchedulable() { diff --git a/pkg/scheduler/objects/required_node_preemptor.go b/pkg/scheduler/objects/required_node_preemptor.go index 39ac81e56..507eb8685 100644 --- a/pkg/scheduler/objects/required_node_preemptor.go +++ b/pkg/scheduler/objects/required_node_preemptor.go @@ -45,7 +45,7 @@ func NewRequiredNodePreemptor(node *Node, requiredAsk *Allocation) *PreemptionCo func (p *PreemptionContext) filterAllocations() { p.Lock() defer p.Unlock() - for _, allocation := range p.node.GetAllAllocations() { + for _, allocation := range p.node.GetYunikornAllocations() { // skip daemon set pods and higher priority allocation if allocation.GetRequiredNode() != "" || allocation.GetPriority() > p.requiredAsk.GetPriority() { continue diff --git a/pkg/scheduler/partition.go b/pkg/scheduler/partition.go index 0db2dbda4..b3d72ba06 100644 --- a/pkg/scheduler/partition.go +++ b/pkg/scheduler/partition.go @@ -672,7 +672,7 @@ func (pc *PartitionContext) removeNodeAllocations(node *objects.Node) ([]*object released := make([]*objects.Allocation, 0) confirmed := make([]*objects.Allocation, 0) // walk over all allocations still registered for this node - for _, alloc := range node.GetAllAllocations() { + for _, alloc := range node.GetYunikornAllocations() { allocationKey := alloc.GetAllocationKey() // since we are not locking the node and or application we could have had an update while processing // note that we do not return the allocation if the app or allocation is not found and assume that it diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go index a086d3263..8a16d34dc 100644 --- a/pkg/scheduler/partition_test.go +++ b/pkg/scheduler/partition_test.go @@ -267,7 +267,7 @@ func TestRemoveNodeWithAllocations(t *testing.T) { assert.NilError(t, err) assert.Check(t, allocCreated) // get what was allocated - allocated := node.GetAllAllocations() + allocated := node.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly") assertLimits(t, getTestUserGroup(), appRes) @@ -316,7 +316,7 @@ func TestRemoveNodeWithPlaceholders(t *testing.T) { assert.NilError(t, err) assert.Check(t, allocCreated) // get what was allocated - allocated := node1.GetAllAllocations() + allocated := node1.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1 expected 1 got: %v", allocated) assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1") assertLimits(t, getTestUserGroup(), appRes) @@ -432,7 +432,7 @@ func TestPlaceholderDataWithPlaceholderPreemption(t *testing.T) { assert.Check(t, allocCreated) // get what was allocated - allocated := node1.GetAllAllocations() + allocated := node1.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1") assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1") @@ -561,7 +561,7 @@ func TestPlaceholderDataWithNodeRemoval(t *testing.T) { assert.Check(t, allocCreated) // get what was allocated - allocated := node1.GetAllAllocations() + allocated := node1.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1") assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1") @@ -648,7 +648,7 @@ func TestPlaceholderDataWithRemoval(t *testing.T) { assert.Check(t, allocCreated) // get what was allocated - allocated := node1.GetAllAllocations() + allocated := node1.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1") assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1") @@ -733,7 +733,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) { assert.Check(t, allocCreated) // get what was allocated - allocated := node1.GetAllAllocations() + allocated := node1.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1") assertLimits(t, getTestUserGroup(), appRes) assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1") @@ -753,7 +753,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) { alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 1, false) alloc.SetRelease(ph) node2.AddAllocation(alloc) - allocated = node2.GetAllAllocations() + allocated = node2.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node2") assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), appRes), "allocation not added correctly to node2 (resource count)") assertLimits(t, getTestUserGroup(), appRes) @@ -768,7 +768,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) { // remove the node with the placeholder released, confirmed := partition.removeNode(nodeID1) assert.Equal(t, 1, partition.GetTotalNodeCount(), "node list was not updated, node was not removed") - assert.Equal(t, 1, len(node2.GetAllAllocations()), "remaining node should have allocation") + assert.Equal(t, 1, len(node2.GetYunikornAllocations()), "remaining node should have allocation") assert.Equal(t, 1, len(released), "node removal did not release correct allocation") assert.Equal(t, 1, len(confirmed), "node removal did not confirm correct allocation") assert.Equal(t, ph.GetAllocationKey(), released[0].GetAllocationKey(), "allocationKey returned by release not the same as the placeholder") @@ -807,7 +807,7 @@ func TestRemoveNodeWithReal(t *testing.T) { assert.NilError(t, err) assert.Check(t, allocCreated) // get what was allocated - allocated := node1.GetAllAllocations() + allocated := node1.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1") assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1") assertLimits(t, getTestUserGroup(), appRes) @@ -827,7 +827,7 @@ func TestRemoveNodeWithReal(t *testing.T) { alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 1, false) alloc.SetRelease(ph) node2.AddAllocation(alloc) - allocated = node2.GetAllAllocations() + allocated = node2.GetYunikornAllocations() assert.Equal(t, 1, len(allocated), "allocation not added correctly to node2") assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), appRes), "allocation not added correctly to node2 (resource count)") assertLimits(t, getTestUserGroup(), appRes) @@ -4729,7 +4729,7 @@ func TestForeignAllocation(t *testing.T) { assert.NilError(t, err) assert.Equal(t, 1, len(partition.foreignAllocs)) assert.Equal(t, nodeID1, partition.foreignAllocs[foreignAlloc1]) - assert.Equal(t, 1, len(node1.GetAllAllocations())) + assert.Equal(t, 0, len(node1.GetYunikornAllocations())) assert.Assert(t, node1.GetAllocation(foreignAlloc1) != nil) // remove allocation @@ -4739,6 +4739,6 @@ func TestForeignAllocation(t *testing.T) { assert.Assert(t, released == nil) assert.Assert(t, confirmed == nil) assert.Equal(t, 0, len(partition.foreignAllocs)) - assert.Equal(t, 0, len(node1.GetAllAllocations())) + assert.Equal(t, 0, len(node1.GetYunikornAllocations())) assert.Assert(t, node1.GetAllocation(foreignAlloc1) == nil) } diff --git a/pkg/scheduler/tests/recovery_test.go b/pkg/scheduler/tests/recovery_test.go index eeb9b8f4e..a71020772 100644 --- a/pkg/scheduler/tests/recovery_test.go +++ b/pkg/scheduler/tests/recovery_test.go @@ -298,8 +298,8 @@ func TestSchedulerRecovery(t *testing.T) { // verify nodes assert.Equal(t, 2, part.GetTotalNodeCount(), "incorrect recovered node count") - assert.Equal(t, len(node1Allocations), len(part.GetNode("node-1:1234").GetAllAllocations()), "allocations on node-1 not as expected") - assert.Equal(t, len(node2Allocations), len(part.GetNode("node-2:1234").GetAllAllocations()), "allocations on node-1 not as expected") + assert.Equal(t, len(node1Allocations), len(part.GetNode("node-1:1234").GetYunikornAllocations()), "allocations on node-1 not as expected") + assert.Equal(t, len(node2Allocations), len(part.GetNode("node-2:1234").GetYunikornAllocations()), "allocations on node-1 not as expected") node1AllocatedMemory := part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.Memory] node2AllocatedMemory := part.GetNode("node-2:1234").GetAllocatedResource().Resources[common.Memory]