Skip to content

Commit

Permalink
[YUNIKORN-959] Simplify partition info in the REST call
Browse files Browse the repository at this point in the history
  • Loading branch information
steinsgateted committed Jan 22, 2023
1 parent a10f54e commit 44d5ff7
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 32 deletions.
16 changes: 6 additions & 10 deletions pkg/scheduler/partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -973,12 +973,6 @@ func (pc *PartitionContext) GetAllocatedResource() *resources.Resource {
return pc.root.GetAllocatedResource()
}

func (pc *PartitionContext) GetTotalApplicationCount() int {
pc.RLock()
defer pc.RUnlock()
return len(pc.applications)
}

func (pc *PartitionContext) GetTotalAllocationCount() int {
pc.RLock()
defer pc.RUnlock()
Expand Down Expand Up @@ -1168,10 +1162,12 @@ func (pc *PartitionContext) convertUGI(ugi *si.UserGroupInformation) (security.U
// which is a slice with 10 elements,
// each element represents a range of resource usage,
// such as
// 0: 0%->10%
// 1: 10% -> 20%
// ...
// 9: 90% -> 100%
//
// 0: 0%->10%
// 1: 10% -> 20%
// ...
// 9: 90% -> 100%
//
// the element value represents number of nodes fall into this bucket.
// if slice[9] = 3, this means there are 3 nodes resource usage is in the range 80% to 90%.
//
Expand Down
2 changes: 0 additions & 2 deletions pkg/scheduler/tests/recovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,6 @@ func TestSchedulerRecoveryWithoutAppInfo(t *testing.T) {
// verify partition resources
part := ms.scheduler.GetClusterContext().GetPartition("[rm:123]default")
assert.Equal(t, part.GetTotalNodeCount(), 1)
assert.Equal(t, part.GetTotalApplicationCount(), 0)
assert.Equal(t, part.GetTotalAllocationCount(), 0)
assert.Equal(t, part.GetNode("node-2:1234").GetAllocatedResource().Resources[common.Memory],
resources.Quantity(0))
Expand Down Expand Up @@ -566,7 +565,6 @@ func TestSchedulerRecoveryWithoutAppInfo(t *testing.T) {
ms.mockRM.waitForAcceptedNode(t, "node-1:1234", 1000)

assert.Equal(t, part.GetTotalNodeCount(), 2)
assert.Equal(t, part.GetTotalApplicationCount(), 1)
assert.Equal(t, part.GetTotalAllocationCount(), 1)
assert.Equal(t, part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.Memory], resources.Quantity(100))
assert.Equal(t, part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.CPU], resources.Quantity(1))
Expand Down
26 changes: 26 additions & 0 deletions pkg/webservice/dao/cluster_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dao

type ClusterDAOInfo struct {
StartTime int64 `json:"startTime"`
RMBuildInformation []map[string]string `json:"rmBuildInformation"`
PartitionName string `json:"partition"`
ClusterName string `json:"clusterName"`
}
20 changes: 9 additions & 11 deletions pkg/webservice/dao/partition_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,15 @@
package dao

type PartitionInfo struct {
ClusterID string `json:"clusterId"`
Name string `json:"name"`
StartTime int64 `json:"startTime"`
RMBuildInformation []map[string]string `json:"rmBuildInformation"`
Capacity PartitionCapacity `json:"capacity"`
NodeSortingPolicy NodeSortingPolicy `json:"nodeSortingPolicy"`
TotalNodes int `json:"totalNodes"`
Applications map[string]int `json:"applications"`
TotalContainers int `json:"totalContainers"`
State string `json:"state"`
LastStateTransitionTime int64 `json:"lastStateTransitionTime"`
ClusterID string `json:"clusterId"`
Name string `json:"name"`
Capacity PartitionCapacity `json:"capacity"`
NodeSortingPolicy NodeSortingPolicy `json:"nodeSortingPolicy"`
TotalNodes int `json:"totalNodes"`
Applications map[string]int `json:"applications"`
TotalContainers int `json:"totalContainers"`
State string `json:"state"`
LastStateTransitionTime int64 `json:"lastStateTransitionTime"`
}

type PartitionCapacity struct {
Expand Down
39 changes: 30 additions & 9 deletions pkg/webservice/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ func getStackInfo(w http.ResponseWriter, r *http.Request) {
}
}

func getClusterInfo(w http.ResponseWriter, r *http.Request) {
writeHeaders(w)

lists := schedulerContext.GetPartitionMapClone()
clustersInfo := getClusterDAO(lists)
if err := json.NewEncoder(w).Encode(clustersInfo); err != nil {
buildJSONErrorResponse(w, err.Error(), http.StatusInternalServerError)
}
}

func validateQueue(queuePath string) error {
if queuePath != "" {
queueNameArr := strings.Split(queuePath, ".")
Expand Down Expand Up @@ -120,6 +130,16 @@ func buildJSONErrorResponse(w http.ResponseWriter, detail string, code int) {
}
}

func getClusterJSON(partition *scheduler.PartitionContext) *dao.ClusterDAOInfo {
clusterInfo := &dao.ClusterDAOInfo{}
clusterInfo.StartTime = schedulerContext.GetStartTime().UnixNano()
rmInfo := schedulerContext.GetRMInfoMapClone()
clusterInfo.RMBuildInformation = getRMBuildInformation(rmInfo)
clusterInfo.PartitionName = common.GetPartitionNameWithoutClusterID(partition.Name)
clusterInfo.ClusterName = "kubernetes"
return clusterInfo
}

func getClusterUtilJSON(partition *scheduler.PartitionContext) []*dao.ClusterUtilDAOInfo {
var utils []*dao.ClusterUtilDAOInfo
var getResource bool = true
Expand Down Expand Up @@ -576,14 +596,8 @@ func getPartitionInfoDAO(lists map[string]*scheduler.PartitionContext) []*dao.Pa
partitionInfo := &dao.PartitionInfo{}
partitionInfo.ClusterID = partitionContext.RmID
partitionInfo.Name = common.GetPartitionNameWithoutClusterID(partitionContext.Name)

partitionInfo.StartTime = schedulerContext.GetStartTime().UnixNano()
rmInfo := schedulerContext.GetRMInfoMapClone()
partitionInfo.RMBuildInformation = getRMBuildInformation(rmInfo)

partitionInfo.State = partitionContext.GetCurrentState()
partitionInfo.LastStateTransitionTime = partitionContext.GetStateTime().UnixNano()

capacityInfo := dao.PartitionCapacity{}
capacity := partitionContext.GetTotalPartitionResource()
usedCapacity := partitionContext.GetAllocatedResource()
Expand All @@ -597,7 +611,6 @@ func getPartitionInfoDAO(lists map[string]*scheduler.PartitionContext) []*dao.Pa
}

partitionInfo.TotalNodes = partitionContext.GetTotalNodeCount()

appList := partitionContext.GetApplications()
appList = append(appList, partitionContext.GetCompletedApplications()...)
appList = append(appList, partitionContext.GetRejectedApplications()...)
Expand All @@ -609,9 +622,7 @@ func getPartitionInfoDAO(lists map[string]*scheduler.PartitionContext) []*dao.Pa
}
applicationsState["total"] = totalApplications
partitionInfo.Applications = applicationsState

partitionInfo.TotalContainers = partitionContext.GetTotalAllocationCount()

result = append(result, partitionInfo)
}

Expand Down Expand Up @@ -698,6 +709,16 @@ func getPartitionQueuesDAO(lists map[string]*scheduler.PartitionContext) []dao.P
return result
}

func getClusterDAO(lists map[string]*scheduler.PartitionContext) []*dao.ClusterDAOInfo {
var result []*dao.ClusterDAOInfo

for _, partition := range lists {
result = append(result, getClusterJSON(partition))
}

return result
}

func getRMBuildInformation(lists map[string]*scheduler.RMInformation) []map[string]string {
var result []map[string]string

Expand Down
20 changes: 20 additions & 0 deletions pkg/webservice/handlers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,25 @@ func TestGetPartitionQueuesHandler(t *testing.T) {
assertPartitionExists(t, resp)
}

func TestGetClusterInfo(t *testing.T) {
setup(t, configTwoLevelQueues, 2)

resp := &MockResponseWriter{}
getClusterInfo(resp, nil)
var data []*dao.ClusterDAOInfo
err := json.Unmarshal(resp.outputBytes, &data)
assert.NilError(t, err)
assert.Equal(t, 2, len(data))

cs := make(map[string]*dao.ClusterDAOInfo, 2)
for _, d := range data {
cs[d.PartitionName] = d
}

assert.Assert(t, cs["default"] != nil)
assert.Assert(t, cs["gpu"] != nil)
}

func TestGetPartitionNodes(t *testing.T) {
partition := setup(t, configDefault, 1)

Expand Down Expand Up @@ -1328,6 +1347,7 @@ func verifyStateDumpJSON(t *testing.T, aggregated *AggregatedStateInfo) {
assert.Check(t, aggregated.Timestamp != 0)
assert.Check(t, len(aggregated.Partitions) > 0)
assert.Check(t, len(aggregated.Nodes) > 0)
assert.Check(t, len(aggregated.ClusterInfo) > 0)
assert.Check(t, len(aggregated.Queues) > 0)
assert.Check(t, len(aggregated.LogLevel) > 0)
}
8 changes: 8 additions & 0 deletions pkg/webservice/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ type route struct {
type routes []route

var webRoutes = routes{
// endpoints to retrieve general cluster info
route{
"Cluster",
"GET",
"/ws/v1/clusters",
getClusterInfo,
},

// endpoint to retrieve goroutines info
route{
"Scheduler",
Expand Down
2 changes: 2 additions & 0 deletions pkg/webservice/state_dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ type AggregatedStateInfo struct {
Applications []*dao.ApplicationDAOInfo
AppHistory []*dao.ApplicationHistoryDAOInfo
Nodes []*dao.NodesDAOInfo
ClusterInfo []*dao.ClusterDAOInfo
ContainerHistory []*dao.ContainerHistoryDAOInfo
Queues []dao.PartitionQueueDAOInfo
RMDiagnostics map[string]interface{}
Expand Down Expand Up @@ -74,6 +75,7 @@ func doStateDump(w io.Writer) error {
Applications: getApplicationsDAO(partitionContext),
AppHistory: getAppHistoryDAO(records),
Nodes: getNodesDAO(partitionContext),
ClusterInfo: getClusterDAO(partitionContext),
ContainerHistory: getContainerHistoryDAO(records),
Queues: getPartitionQueuesDAO(partitionContext),
RMDiagnostics: getResourceManagerDiagnostics(),
Expand Down

0 comments on commit 44d5ff7

Please sign in to comment.