Skip to content

Commit

Permalink
[YUNIKORN-2897] Health checker reports foreign allocation as orphan (#…
Browse files Browse the repository at this point in the history
…977)

Closes: #977

Signed-off-by: Peter Bacsko <[email protected]>
  • Loading branch information
pbacsko committed Oct 3, 2024
1 parent a2d3d43 commit 4ea2251
Show file tree
Hide file tree
Showing 8 changed files with 26 additions and 53 deletions.
2 changes: 1 addition & 1 deletion pkg/scheduler/health_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ func checkAppAllocations(app *objects.Application, nodes objects.NodeCollection)

func checkNodeAllocations(node *objects.Node, partitionContext *PartitionContext) []*objects.Allocation {
orphanAllocationsOnNode := make([]*objects.Allocation, 0)
for _, alloc := range node.GetAllAllocations() {
for _, alloc := range node.GetYunikornAllocations() {
if app := partitionContext.getApplication(alloc.GetApplicationID()); app != nil {
if !app.IsAllocationAssignedToApp(alloc) {
orphanAllocationsOnNode = append(orphanAllocationsOnNode, alloc)
Expand Down
9 changes: 9 additions & 0 deletions pkg/scheduler/health_checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,15 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app should not be successful")
part.removeApplication("appID")
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
assert.Assert(t, healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app still fails after removing the app")

// check that foreign allocation does not interfere with health check
falloc := newForeignAllocation("foreign-1", "node")
node.AddAllocation(falloc)
healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "Foreign allocation was detected as orphan")
}

func TestGetSchedulerHealthStatusMetrics(t *testing.T) {
Expand Down
13 changes: 0 additions & 13 deletions pkg/scheduler/objects/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,19 +230,6 @@ func (sn *Node) GetAllocation(allocationKey string) *Allocation {
return sn.allocations[allocationKey]
}

// Get a copy of the allocations on this node
func (sn *Node) GetAllAllocations() []*Allocation {
sn.RLock()
defer sn.RUnlock()

arr := make([]*Allocation, 0)
for _, v := range sn.allocations {
arr = append(arr, v)
}

return arr
}

// GetYunikornAllocations returns a copy of Yunikorn allocations on this node
func (sn *Node) GetYunikornAllocations() []*Allocation {
sn.RLock()
Expand Down
23 changes: 0 additions & 23 deletions pkg/scheduler/objects/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,29 +688,6 @@ func TestGetAllocation(t *testing.T) {
}
}

func TestGetAllAllocations(t *testing.T) {
node := newNode("node-123", map[string]resources.Quantity{"first": 100, "second": 200})
if !resources.IsZero(node.GetAllocatedResource()) {
t.Fatal("Failed to initialize resource")
}

// nothing allocated get an empty list
allocs := node.GetAllAllocations()
if allocs == nil || len(allocs) != 0 {
t.Fatalf("allocation length should be 0 on new node")
}
alloc1 := newAllocation(appID1, nodeID1, nil)
alloc2 := newAllocation(appID1, nodeID1, nil)

// allocate
node.AddAllocation(alloc1)
node.AddAllocation(alloc2)
assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length mismatch")
// This should not happen in real code just making sure the code does do what is expected
node.AddAllocation(alloc2)
assert.Equal(t, 2, len(node.GetAllAllocations()), "allocation length mismatch")
}

func TestSchedulingState(t *testing.T) {
node := newNode("node-123", nil)
if !node.IsSchedulable() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/objects/required_node_preemptor.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func NewRequiredNodePreemptor(node *Node, requiredAsk *Allocation) *PreemptionCo
func (p *PreemptionContext) filterAllocations() {
p.Lock()
defer p.Unlock()
for _, allocation := range p.node.GetAllAllocations() {
for _, allocation := range p.node.GetYunikornAllocations() {
// skip daemon set pods and higher priority allocation
if allocation.GetRequiredNode() != "" || allocation.GetPriority() > p.requiredAsk.GetPriority() {
continue
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,7 @@ func (pc *PartitionContext) removeNodeAllocations(node *objects.Node) ([]*object
released := make([]*objects.Allocation, 0)
confirmed := make([]*objects.Allocation, 0)
// walk over all allocations still registered for this node
for _, alloc := range node.GetAllAllocations() {
for _, alloc := range node.GetYunikornAllocations() {
allocationKey := alloc.GetAllocationKey()
// since we are not locking the node and or application we could have had an update while processing
// note that we do not return the allocation if the app or allocation is not found and assume that it
Expand Down
24 changes: 12 additions & 12 deletions pkg/scheduler/partition_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ func TestRemoveNodeWithAllocations(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
allocated := node.GetAllAllocations()
allocated := node.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly")
assertLimits(t, getTestUserGroup(), appRes)

Expand Down Expand Up @@ -316,7 +316,7 @@ func TestRemoveNodeWithPlaceholders(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1 expected 1 got: %v", allocated)
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
Expand Down Expand Up @@ -432,7 +432,7 @@ func TestPlaceholderDataWithPlaceholderPreemption(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")

Expand Down Expand Up @@ -561,7 +561,7 @@ func TestPlaceholderDataWithNodeRemoval(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")

Expand Down Expand Up @@ -648,7 +648,7 @@ func TestPlaceholderDataWithRemoval(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")

Expand Down Expand Up @@ -733,7 +733,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
assert.Check(t, allocCreated)

// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")
Expand All @@ -753,7 +753,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 1, false)
alloc.SetRelease(ph)
node2.AddAllocation(alloc)
allocated = node2.GetAllAllocations()
allocated = node2.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node2")
assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), appRes), "allocation not added correctly to node2 (resource count)")
assertLimits(t, getTestUserGroup(), appRes)
Expand All @@ -768,7 +768,7 @@ func TestRemoveNodeWithReplacement(t *testing.T) {
// remove the node with the placeholder
released, confirmed := partition.removeNode(nodeID1)
assert.Equal(t, 1, partition.GetTotalNodeCount(), "node list was not updated, node was not removed")
assert.Equal(t, 1, len(node2.GetAllAllocations()), "remaining node should have allocation")
assert.Equal(t, 1, len(node2.GetYunikornAllocations()), "remaining node should have allocation")
assert.Equal(t, 1, len(released), "node removal did not release correct allocation")
assert.Equal(t, 1, len(confirmed), "node removal did not confirm correct allocation")
assert.Equal(t, ph.GetAllocationKey(), released[0].GetAllocationKey(), "allocationKey returned by release not the same as the placeholder")
Expand Down Expand Up @@ -807,7 +807,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, allocCreated)
// get what was allocated
allocated := node1.GetAllAllocations()
allocated := node1.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node1")
assert.Assert(t, resources.Equals(node1.GetAllocatedResource(), appRes), "allocation not added correctly to node1")
assertLimits(t, getTestUserGroup(), appRes)
Expand All @@ -827,7 +827,7 @@ func TestRemoveNodeWithReal(t *testing.T) {
alloc := newAllocationAll(allocKey, appID1, nodeID2, taskGroup, appRes, 1, false)
alloc.SetRelease(ph)
node2.AddAllocation(alloc)
allocated = node2.GetAllAllocations()
allocated = node2.GetYunikornAllocations()
assert.Equal(t, 1, len(allocated), "allocation not added correctly to node2")
assert.Assert(t, resources.Equals(node2.GetAllocatedResource(), appRes), "allocation not added correctly to node2 (resource count)")
assertLimits(t, getTestUserGroup(), appRes)
Expand Down Expand Up @@ -4729,7 +4729,7 @@ func TestForeignAllocation(t *testing.T) {
assert.NilError(t, err)
assert.Equal(t, 1, len(partition.foreignAllocs))
assert.Equal(t, nodeID1, partition.foreignAllocs[foreignAlloc1])
assert.Equal(t, 1, len(node1.GetAllAllocations()))
assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
assert.Assert(t, node1.GetAllocation(foreignAlloc1) != nil)

// remove allocation
Expand All @@ -4739,6 +4739,6 @@ func TestForeignAllocation(t *testing.T) {
assert.Assert(t, released == nil)
assert.Assert(t, confirmed == nil)
assert.Equal(t, 0, len(partition.foreignAllocs))
assert.Equal(t, 0, len(node1.GetAllAllocations()))
assert.Equal(t, 0, len(node1.GetYunikornAllocations()))
assert.Assert(t, node1.GetAllocation(foreignAlloc1) == nil)
}
4 changes: 2 additions & 2 deletions pkg/scheduler/tests/recovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,8 @@ func TestSchedulerRecovery(t *testing.T) {
// verify nodes
assert.Equal(t, 2, part.GetTotalNodeCount(), "incorrect recovered node count")

assert.Equal(t, len(node1Allocations), len(part.GetNode("node-1:1234").GetAllAllocations()), "allocations on node-1 not as expected")
assert.Equal(t, len(node2Allocations), len(part.GetNode("node-2:1234").GetAllAllocations()), "allocations on node-1 not as expected")
assert.Equal(t, len(node1Allocations), len(part.GetNode("node-1:1234").GetYunikornAllocations()), "allocations on node-1 not as expected")
assert.Equal(t, len(node2Allocations), len(part.GetNode("node-2:1234").GetYunikornAllocations()), "allocations on node-1 not as expected")

node1AllocatedMemory := part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.Memory]
node2AllocatedMemory := part.GetNode("node-2:1234").GetAllocatedResource().Resources[common.Memory]
Expand Down

0 comments on commit 4ea2251

Please sign in to comment.