From 44d5ff76fd64e0fadc10fd70668ccaaa761ddaa9 Mon Sep 17 00:00:00 2001 From: steinsgateted Date: Sun, 22 Jan 2023 16:40:26 +0800 Subject: [PATCH] [YUNIKORN-959] Simplify partition info in the REST call --- pkg/scheduler/partition.go | 16 +++++------- pkg/scheduler/tests/recovery_test.go | 2 -- pkg/webservice/dao/cluster_info.go | 26 +++++++++++++++++++ pkg/webservice/dao/partition_info.go | 20 +++++++------- pkg/webservice/handlers.go | 39 +++++++++++++++++++++------- pkg/webservice/handlers_test.go | 20 ++++++++++++++ pkg/webservice/routes.go | 8 ++++++ pkg/webservice/state_dump.go | 2 ++ 8 files changed, 101 insertions(+), 32 deletions(-) create mode 100644 pkg/webservice/dao/cluster_info.go diff --git a/pkg/scheduler/partition.go b/pkg/scheduler/partition.go index 70678ded5..45680b6f8 100644 --- a/pkg/scheduler/partition.go +++ b/pkg/scheduler/partition.go @@ -973,12 +973,6 @@ func (pc *PartitionContext) GetAllocatedResource() *resources.Resource { return pc.root.GetAllocatedResource() } -func (pc *PartitionContext) GetTotalApplicationCount() int { - pc.RLock() - defer pc.RUnlock() - return len(pc.applications) -} - func (pc *PartitionContext) GetTotalAllocationCount() int { pc.RLock() defer pc.RUnlock() @@ -1168,10 +1162,12 @@ func (pc *PartitionContext) convertUGI(ugi *si.UserGroupInformation) (security.U // which is a slice with 10 elements, // each element represents a range of resource usage, // such as -// 0: 0%->10% -// 1: 10% -> 20% -// ... -// 9: 90% -> 100% +// +// 0: 0%->10% +// 1: 10% -> 20% +// ... +// 9: 90% -> 100% +// // the element value represents number of nodes fall into this bucket. // if slice[9] = 3, this means there are 3 nodes resource usage is in the range 80% to 90%. // diff --git a/pkg/scheduler/tests/recovery_test.go b/pkg/scheduler/tests/recovery_test.go index fcf3371ee..b66705eb9 100644 --- a/pkg/scheduler/tests/recovery_test.go +++ b/pkg/scheduler/tests/recovery_test.go @@ -514,7 +514,6 @@ func TestSchedulerRecoveryWithoutAppInfo(t *testing.T) { // verify partition resources part := ms.scheduler.GetClusterContext().GetPartition("[rm:123]default") assert.Equal(t, part.GetTotalNodeCount(), 1) - assert.Equal(t, part.GetTotalApplicationCount(), 0) assert.Equal(t, part.GetTotalAllocationCount(), 0) assert.Equal(t, part.GetNode("node-2:1234").GetAllocatedResource().Resources[common.Memory], resources.Quantity(0)) @@ -566,7 +565,6 @@ func TestSchedulerRecoveryWithoutAppInfo(t *testing.T) { ms.mockRM.waitForAcceptedNode(t, "node-1:1234", 1000) assert.Equal(t, part.GetTotalNodeCount(), 2) - assert.Equal(t, part.GetTotalApplicationCount(), 1) assert.Equal(t, part.GetTotalAllocationCount(), 1) assert.Equal(t, part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.Memory], resources.Quantity(100)) assert.Equal(t, part.GetNode("node-1:1234").GetAllocatedResource().Resources[common.CPU], resources.Quantity(1)) diff --git a/pkg/webservice/dao/cluster_info.go b/pkg/webservice/dao/cluster_info.go new file mode 100644 index 000000000..def014ffa --- /dev/null +++ b/pkg/webservice/dao/cluster_info.go @@ -0,0 +1,26 @@ +/* + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package dao + +type ClusterDAOInfo struct { + StartTime int64 `json:"startTime"` + RMBuildInformation []map[string]string `json:"rmBuildInformation"` + PartitionName string `json:"partition"` + ClusterName string `json:"clusterName"` +} diff --git a/pkg/webservice/dao/partition_info.go b/pkg/webservice/dao/partition_info.go index 54f99d1eb..34eae88c4 100644 --- a/pkg/webservice/dao/partition_info.go +++ b/pkg/webservice/dao/partition_info.go @@ -19,17 +19,15 @@ package dao type PartitionInfo struct { - ClusterID string `json:"clusterId"` - Name string `json:"name"` - StartTime int64 `json:"startTime"` - RMBuildInformation []map[string]string `json:"rmBuildInformation"` - Capacity PartitionCapacity `json:"capacity"` - NodeSortingPolicy NodeSortingPolicy `json:"nodeSortingPolicy"` - TotalNodes int `json:"totalNodes"` - Applications map[string]int `json:"applications"` - TotalContainers int `json:"totalContainers"` - State string `json:"state"` - LastStateTransitionTime int64 `json:"lastStateTransitionTime"` + ClusterID string `json:"clusterId"` + Name string `json:"name"` + Capacity PartitionCapacity `json:"capacity"` + NodeSortingPolicy NodeSortingPolicy `json:"nodeSortingPolicy"` + TotalNodes int `json:"totalNodes"` + Applications map[string]int `json:"applications"` + TotalContainers int `json:"totalContainers"` + State string `json:"state"` + LastStateTransitionTime int64 `json:"lastStateTransitionTime"` } type PartitionCapacity struct { diff --git a/pkg/webservice/handlers.go b/pkg/webservice/handlers.go index 05684d7e2..f0d02a710 100644 --- a/pkg/webservice/handlers.go +++ b/pkg/webservice/handlers.go @@ -72,6 +72,16 @@ func getStackInfo(w http.ResponseWriter, r *http.Request) { } } +func getClusterInfo(w http.ResponseWriter, r *http.Request) { + writeHeaders(w) + + lists := schedulerContext.GetPartitionMapClone() + clustersInfo := getClusterDAO(lists) + if err := json.NewEncoder(w).Encode(clustersInfo); err != nil { + buildJSONErrorResponse(w, err.Error(), http.StatusInternalServerError) + } +} + func validateQueue(queuePath string) error { if queuePath != "" { queueNameArr := strings.Split(queuePath, ".") @@ -120,6 +130,16 @@ func buildJSONErrorResponse(w http.ResponseWriter, detail string, code int) { } } +func getClusterJSON(partition *scheduler.PartitionContext) *dao.ClusterDAOInfo { + clusterInfo := &dao.ClusterDAOInfo{} + clusterInfo.StartTime = schedulerContext.GetStartTime().UnixNano() + rmInfo := schedulerContext.GetRMInfoMapClone() + clusterInfo.RMBuildInformation = getRMBuildInformation(rmInfo) + clusterInfo.PartitionName = common.GetPartitionNameWithoutClusterID(partition.Name) + clusterInfo.ClusterName = "kubernetes" + return clusterInfo +} + func getClusterUtilJSON(partition *scheduler.PartitionContext) []*dao.ClusterUtilDAOInfo { var utils []*dao.ClusterUtilDAOInfo var getResource bool = true @@ -576,14 +596,8 @@ func getPartitionInfoDAO(lists map[string]*scheduler.PartitionContext) []*dao.Pa partitionInfo := &dao.PartitionInfo{} partitionInfo.ClusterID = partitionContext.RmID partitionInfo.Name = common.GetPartitionNameWithoutClusterID(partitionContext.Name) - - partitionInfo.StartTime = schedulerContext.GetStartTime().UnixNano() - rmInfo := schedulerContext.GetRMInfoMapClone() - partitionInfo.RMBuildInformation = getRMBuildInformation(rmInfo) - partitionInfo.State = partitionContext.GetCurrentState() partitionInfo.LastStateTransitionTime = partitionContext.GetStateTime().UnixNano() - capacityInfo := dao.PartitionCapacity{} capacity := partitionContext.GetTotalPartitionResource() usedCapacity := partitionContext.GetAllocatedResource() @@ -597,7 +611,6 @@ func getPartitionInfoDAO(lists map[string]*scheduler.PartitionContext) []*dao.Pa } partitionInfo.TotalNodes = partitionContext.GetTotalNodeCount() - appList := partitionContext.GetApplications() appList = append(appList, partitionContext.GetCompletedApplications()...) appList = append(appList, partitionContext.GetRejectedApplications()...) @@ -609,9 +622,7 @@ func getPartitionInfoDAO(lists map[string]*scheduler.PartitionContext) []*dao.Pa } applicationsState["total"] = totalApplications partitionInfo.Applications = applicationsState - partitionInfo.TotalContainers = partitionContext.GetTotalAllocationCount() - result = append(result, partitionInfo) } @@ -698,6 +709,16 @@ func getPartitionQueuesDAO(lists map[string]*scheduler.PartitionContext) []dao.P return result } +func getClusterDAO(lists map[string]*scheduler.PartitionContext) []*dao.ClusterDAOInfo { + var result []*dao.ClusterDAOInfo + + for _, partition := range lists { + result = append(result, getClusterJSON(partition)) + } + + return result +} + func getRMBuildInformation(lists map[string]*scheduler.RMInformation) []map[string]string { var result []map[string]string diff --git a/pkg/webservice/handlers_test.go b/pkg/webservice/handlers_test.go index 697e7de1f..e83da9230 100644 --- a/pkg/webservice/handlers_test.go +++ b/pkg/webservice/handlers_test.go @@ -760,6 +760,25 @@ func TestGetPartitionQueuesHandler(t *testing.T) { assertPartitionExists(t, resp) } +func TestGetClusterInfo(t *testing.T) { + setup(t, configTwoLevelQueues, 2) + + resp := &MockResponseWriter{} + getClusterInfo(resp, nil) + var data []*dao.ClusterDAOInfo + err := json.Unmarshal(resp.outputBytes, &data) + assert.NilError(t, err) + assert.Equal(t, 2, len(data)) + + cs := make(map[string]*dao.ClusterDAOInfo, 2) + for _, d := range data { + cs[d.PartitionName] = d + } + + assert.Assert(t, cs["default"] != nil) + assert.Assert(t, cs["gpu"] != nil) +} + func TestGetPartitionNodes(t *testing.T) { partition := setup(t, configDefault, 1) @@ -1328,6 +1347,7 @@ func verifyStateDumpJSON(t *testing.T, aggregated *AggregatedStateInfo) { assert.Check(t, aggregated.Timestamp != 0) assert.Check(t, len(aggregated.Partitions) > 0) assert.Check(t, len(aggregated.Nodes) > 0) + assert.Check(t, len(aggregated.ClusterInfo) > 0) assert.Check(t, len(aggregated.Queues) > 0) assert.Check(t, len(aggregated.LogLevel) > 0) } diff --git a/pkg/webservice/routes.go b/pkg/webservice/routes.go index 535af00da..a77eaac31 100644 --- a/pkg/webservice/routes.go +++ b/pkg/webservice/routes.go @@ -33,6 +33,14 @@ type route struct { type routes []route var webRoutes = routes{ + // endpoints to retrieve general cluster info + route{ + "Cluster", + "GET", + "/ws/v1/clusters", + getClusterInfo, + }, + // endpoint to retrieve goroutines info route{ "Scheduler", diff --git a/pkg/webservice/state_dump.go b/pkg/webservice/state_dump.go index a046c6355..722934845 100644 --- a/pkg/webservice/state_dump.go +++ b/pkg/webservice/state_dump.go @@ -42,6 +42,7 @@ type AggregatedStateInfo struct { Applications []*dao.ApplicationDAOInfo AppHistory []*dao.ApplicationHistoryDAOInfo Nodes []*dao.NodesDAOInfo + ClusterInfo []*dao.ClusterDAOInfo ContainerHistory []*dao.ContainerHistoryDAOInfo Queues []dao.PartitionQueueDAOInfo RMDiagnostics map[string]interface{} @@ -74,6 +75,7 @@ func doStateDump(w io.Writer) error { Applications: getApplicationsDAO(partitionContext), AppHistory: getAppHistoryDAO(records), Nodes: getNodesDAO(partitionContext), + ClusterInfo: getClusterDAO(partitionContext), ContainerHistory: getContainerHistoryDAO(records), Queues: getPartitionQueuesDAO(partitionContext), RMDiagnostics: getResourceManagerDiagnostics(),