From b10160fc299d9fe6062ce95661e31827ed62cf40 Mon Sep 17 00:00:00 2001 From: Abhishek Malvankar Date: Mon, 7 Aug 2023 10:39:18 -0400 Subject: [PATCH] add docs accounting and resolve merge issue --- pkg/controller/queuejob/queuejob_controller_ex.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 8f1a71f8e..9d61bbbd2 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -718,6 +718,7 @@ func (qjm *XController) addTotalSnapshotResourcesConsumedByAw(totalgpu int32, to func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClusterResources *clusterstateapi. Resource, targetpr float64, requestingJob *arbv1.AppWrapper, agentId string) (*clusterstateapi.Resource, []*arbv1.AppWrapper) { + //get available free resources in the cluster. r := unallocatedClusterResources.Clone() // Track preemption resources preemptable := clusterstateapi.EmptyResource() @@ -732,7 +733,10 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust klog.Errorf("[getAggAvaiResPri] Unable to obtain the list of queueJobs %+v", err) return r, nil } - + //for all AWs whose canRun status is true + //in non-preemption mode, we reserve resources for AWs + //reserving is done by subtracting the resources of pods owned by the AW that are running or completed from the total AW resources.
+ // an AW can be running while items owned by it have completed, or there might be a new set of pods yet to be spawned for _, value := range queueJobs { klog.V(10).Infof("[getAggAvaiResPri] %s: Evaluating job: %s to calculate aggregated resources.", time.Now().String(), value.Name) if value.Name == requestingJob.Name { @@ -797,10 +801,11 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust totalResource := qjm.addTotalSnapshotResourcesConsumedByAw(value.Status.TotalGPU, value.Status.TotalCPU, value.Status.TotalMemory) klog.V(6).Infof("[getAggAvaiResPri] total resources consumed by Appwrapper %v when CanRun are %v", value.Name, totalResource) - pending, err = qjv.NonNegSub(totalResource) + delta, err := qjv.NonNegSub(totalResource) + pending = pending.Add(delta) if err != nil { klog.Warningf("[getAggAvaiResPri] Subtraction of resources failed, adding entire appwrapper resoources %v, %v", qjv, err) - pending = qjv + pending = pending.Add(qjv) } klog.V(6).Infof("[getAggAvaiResPri] The value of pending is %v", pending) continue