From 66a96ba036cd44e95bccef34db9f6e094b1ffdd1 Mon Sep 17 00:00:00 2001 From: usaveh <72734623+usaveh@users.noreply.github.com> Date: Sun, 4 Aug 2024 21:51:52 +0800 Subject: [PATCH] =?UTF-8?q?feat(portal):=20=E4=BF=AE=E5=A4=8D=E9=97=A8?= =?UTF-8?q?=E6=88=B7=E4=B8=AD=E8=8A=82=E7=82=B9=E8=AE=A1=E6=95=B0=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E7=9A=84=E9=97=AE=E9=A2=98(=E8=BF=99=E4=B8=AA?= =?UTF-8?q?=E5=88=86=E6=94=AF=E8=A6=81=E7=AD=89=E9=80=82=E9=85=8D=E5=99=A8?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E6=89=8D=E8=83=BD=E5=90=88=E5=B9=B6)=20(#137?= =?UTF-8?q?0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 做了什么 ![image](https://github.com/user-attachments/assets/63544cfe-5cfc-468b-8446-70d3e8c7c670) ![image](https://github.com/user-attachments/assets/a6c89258-4233-40fe-b653-2d88c0de20b8) 节点会被重复计算在集群下的不同分区中 现在在集群和平台概览时会去除该重复节点 ![image](https://github.com/user-attachments/assets/8e29ff97-d2b7-4df5-a37b-0d6ceb8658ee) 门户中可用节点的翻译调整为->空闲节点 --------- Co-authored-by: Picca Sun --- .changeset/silver-pianos-clap.md | 7 + .changeset/six-shoes-refuse.md | 5 + apps/mis-server/src/bl/block.ts | 1 - apps/portal-server/src/services/config.ts | 22 +- apps/portal-web/src/apis/api.mock.ts | 207 +++++++++++------- apps/portal-web/src/apis/api.ts | 2 + apps/portal-web/src/i18n/zh_cn.ts | 2 +- .../api/dashboard/getClusterNodesInfo.ts | 75 +++++++ apps/portal-web/src/pages/dashboard.tsx | 185 +++++++++++----- dev/test-adapter/src/services/config.ts | 3 + libs/protos/scheduler-adapter/package.json | 2 +- protos/portal/config.proto | 37 ++++ 12 files changed, 410 insertions(+), 138 deletions(-) create mode 100644 .changeset/silver-pianos-clap.md create mode 100644 .changeset/six-shoes-refuse.md create mode 100644 apps/portal-web/src/pages/api/dashboard/getClusterNodesInfo.ts diff --git a/.changeset/silver-pianos-clap.md b/.changeset/silver-pianos-clap.md new file mode 100644 index 0000000000..e848d4dfab --- /dev/null +++ b/.changeset/silver-pianos-clap.md @@ -0,0 
+1,7 @@ +--- +"@scow/scheduler-adapter-protos": minor +"@scow/portal-server": minor +"@scow/portal-web": patch +--- + +修复了门户系统中节点在不同集群中重复计数的问题 diff --git a/.changeset/six-shoes-refuse.md b/.changeset/six-shoes-refuse.md new file mode 100644 index 0000000000..9a716721b7 --- /dev/null +++ b/.changeset/six-shoes-refuse.md @@ -0,0 +1,5 @@ +--- +"@scow/portal-server": minor +--- + +新增 getClusterNodesInfo 接口,用于获取集群分区节点信息。 diff --git a/apps/mis-server/src/bl/block.ts b/apps/mis-server/src/bl/block.ts index c135ad9987..fb90283eda 100644 --- a/apps/mis-server/src/bl/block.ts +++ b/apps/mis-server/src/bl/block.ts @@ -290,4 +290,3 @@ export async function unblockUserInAccount( accountName, userId, }, logger); } - diff --git a/apps/portal-server/src/services/config.ts b/apps/portal-server/src/services/config.ts index 9688380ea9..2540db97f6 100644 --- a/apps/portal-server/src/services/config.ts +++ b/apps/portal-server/src/services/config.ts @@ -67,7 +67,6 @@ export const staticConfigServiceServer = plugin((server) => { return [{ partitions: availablePartitions }]; }, - getClusterConfigFiles: async ({ logger }) => { const clusterConfigs = getClusterConfigs(undefined, logger, ["hpc"]); @@ -112,11 +111,30 @@ export const runtimeConfigServiceServer = plugin((server) => { const minRequiredApiVersion: ApiVersion = { major: 1, minor: 4, patch: 0 }; // 检验调度器的API版本是否符合要求,不符合要求报错 await checkSchedulerApiVersion(client, minRequiredApiVersion); - return await asyncClientCall(client.config, "getClusterInfo", {}); + return await asyncClientCall(client.config, "getClusterInfo", request); }, ); return [reply]; }, + + getClusterNodesInfo: async ({ request, logger }) => { + const { nodeNames,cluster } = request; + + const reply = await callOnOne( + cluster, + logger, + async (client) => { + // 当前接口要求的最低调度器接口版本 + const minRequiredApiVersion: ApiVersion = { major: 1, minor: 6, patch: 0 }; + // 检验调度器的API版本是否符合要求,不符合要求报错 + await checkSchedulerApiVersion(client, minRequiredApiVersion); + return await 
asyncClientCall(client.config, "getClusterNodesInfo", { + nodeNames: nodeNames || [], + }); + }, + ); + return [{ nodes: reply.nodes }]; + }, }); }); diff --git a/apps/portal-web/src/apis/api.mock.ts b/apps/portal-web/src/apis/api.mock.ts index 704413cc29..21bd57797c 100644 --- a/apps/portal-web/src/apis/api.mock.ts +++ b/apps/portal-web/src/apis/api.mock.ts @@ -63,45 +63,46 @@ export const job: JobInfo = { }; export const mockApi: MockApi = { - - getQuickEntries:async () => ({ quickEntries: [ - { - id:"submitJob", - name:"submitJob", - entry:{ - $case:"pageLink", - pageLink:{ - path: "/jobs/submit", - icon:"PlusCircleOutlined", + getQuickEntries: async () => ({ + quickEntries: [ + { + id: "submitJob", + name: "submitJob", + entry: { + $case: "pageLink", + pageLink: { + path: "/jobs/submit", + icon: "PlusCircleOutlined", + }, }, }, - }, - { - id:"runningJob", - name:"runningJobs", - entry:{ - $case:"pageLink", - pageLink:{ - path: "/jobs/runningJobs", - icon:"BookOutlined", + { + id: "runningJob", + name: "runningJobs", + entry: { + $case: "pageLink", + pageLink: { + path: "/jobs/runningJobs", + icon: "BookOutlined", + }, }, }, - }, - { - id:"allJobs", - name:"allJobs", - entry:{ - $case:"pageLink", - pageLink:{ - path: "/jobs/allJobs", - icon:"BookOutlined", + { + id: "allJobs", + name: "allJobs", + entry: { + $case: "pageLink", + pageLink: { + path: "/jobs/allJobs", + icon: "BookOutlined", + }, }, }, - }, - ]}), - saveQuickEntries:null, + ], + }), + saveQuickEntries: null, getClusterInfo: null, - getClusterRunningInfo:null, + getClusterRunningInfo: null, listAvailableTransferClusters: null, checkAppConnectivity: async () => ({ @@ -112,7 +113,7 @@ export const mockApi: MockApi = { listAvailableApps: async () => ({ apps: [ - { id: "vscode", name: "VSCode", logoPath:"/apps/VSCode.svg" }, + { id: "vscode", name: "VSCode", logoPath: "/apps/VSCode.svg" }, { id: "emacs", name: "Emacs" }, { id: "jupyter", name: "jupyter" }, ], @@ -141,33 +142,50 @@ export const mockApi: 
MockApi = { wms: [{ name: "cinnamon", wm: "Cinnamon" }, { name: "gnome", wm: "GNOME" }], }), - getAppSessions: async () => ({ sessions: [ - { jobId: 100, sessionId: "123", appId: "vscode", appName:"vscode", state: "PENDING", reason: "resource", - submitTime: new Date().toISOString(), host: "192.168.88.100", port: 1000, dataPath: "/test", - timeLimit: "01:00:00", runningTime: "" }, - { jobId: 101, sessionId: "124", appId: "vscode", appName:"vscode", state: "RUNNING", - submitTime: new Date().toISOString(), dataPath: "/test", - timeLimit: "1-01:00:00", runningTime: "01:50" }, - { jobId: 102, sessionId: "125", appId: "vscode", appName:"vscode", state: "RUNNING", - submitTime: new Date().toISOString(), host: "192.168.88.100", port: 10000, dataPath: "/test", - timeLimit: "INVALID", runningTime: "01:55" }, - ]}), + getAppSessions: async () => ({ + sessions: [ + { + jobId: 100, sessionId: "123", appId: "vscode", appName: "vscode", state: "PENDING", reason: "resource", + submitTime: new Date().toISOString(), host: "192.168.88.100", port: 1000, dataPath: "/test", + timeLimit: "01:00:00", runningTime: "", + }, + { + jobId: 101, sessionId: "124", appId: "vscode", appName: "vscode", state: "RUNNING", + submitTime: new Date().toISOString(), dataPath: "/test", + timeLimit: "1-01:00:00", runningTime: "01:50", + }, + { + jobId: 102, sessionId: "125", appId: "vscode", appName: "vscode", state: "RUNNING", + submitTime: new Date().toISOString(), host: "192.168.88.100", port: 10000, dataPath: "/test", + timeLimit: "INVALID", runningTime: "01:55", + }, + ], + }), getAppMetadata: async () => ({ appName: "test", appCustomFormAttributes: [ - { type: "NUMBER", label: "版本", name: "version", required: false, - placeholder: "选择版本", defaultValue: 123, select: []}, - { type: "TEXT", label: "文字", name: "text", required: false, - placeholder: "提示信息", defaultValue: 555, select: []}, - { type: "TEXT", label: "其他sbatch参数", name: "sbatchOptions", - required: true, placeholder: "比如:--gpus gres:2 
--time 10", select: []}, - { type: "SELECT", label: "选项", name: "option", required: false, + { + type: "NUMBER", label: "版本", name: "version", required: false, + placeholder: "选择版本", defaultValue: 123, select: [], + }, + { + type: "TEXT", label: "文字", name: "text", required: false, + placeholder: "提示信息", defaultValue: 555, select: [], + }, + { + type: "TEXT", label: "其他sbatch参数", name: "sbatchOptions", + required: true, placeholder: "比如:--gpus gres:2 --time 10", select: [], + }, + { + type: "SELECT", label: "选项", name: "option", required: false, placeholder: "提示信息", defaultValue: "version2", select: [ { label: "版本1", value: "version1" }, { label: "版本2", value: "version2" }, - ]}, - ]}), + ], + }, + ], + }), connectToApp: async ({ body: { sessionId } }) => sessionId === "124" ? { @@ -183,8 +201,8 @@ export const mockApi: MockApi = { } : { host: "127.0.0.1", port: 3000, password: "123", type: "vnc", - } - , + }, + getJobTemplate: async () => ({ template: { @@ -199,16 +217,18 @@ export const mockApi: MockApi = { output: "job.%j.out", errorOutput: "job.%j.err", workingDirectory: "/nfs/jobs/123", - maxTimeUnit: TimeUnit.MINUTES, + maxTimeUnit: TimeUnit.MINUTES, }, }), - listJobTemplates: async () => ({ results: [{ - id: "123-sample-apple", - comment: "1234", - submitTime: new Date().toString(), - jobName: "sample-apple", - }]}), + listJobTemplates: async () => ({ + results: [{ + id: "123-sample-apple", + comment: "1234", + submitTime: new Date().toString(), + jobName: "sample-apple", + }], + }), deleteJobTemplate: async () => null, @@ -277,30 +297,49 @@ export const mockApi: MockApi = { checkTransferKey: null, getAvailablePartitionsForCluster: async () => ({ partitions: []}), - getClusterConfigFiles: async () => ({ clusterConfigs: { - hpc01: { - displayName: "hpc01Name", - priority: 1, - adapterUrl: "0.0.0.0:0000", - proxyGateway: undefined, - loginNodes: [{ "address": "localhost:22222", "name": "login" }], - loginDesktop: undefined, - turboVncPath: undefined, - 
crossClusterFileTransfer: undefined, - hpc: { enabled: true }, - ai: { enabled: false }, - k8s: undefined, + getClusterConfigFiles: async () => ({ + clusterConfigs: { + hpc01: { + displayName: "hpc01Name", + priority: 1, + adapterUrl: "0.0.0.0:0000", + proxyGateway: undefined, + loginNodes: [{ "address": "localhost:22222", "name": "login" }], + loginDesktop: undefined, + turboVncPath: undefined, + crossClusterFileTransfer: undefined, + hpc: { enabled: true }, + ai: { enabled: false }, + k8s: undefined, + }, }, - } }), - - getClustersRuntimeInfo: async () => ({ results: [{ - clusterId: "hpc01", - activationStatus: ClusterActivationStatus.ACTIVATED, - operatorId: undefined, - operatorName: undefined, - comment: "", - }]}), + }), + getClustersRuntimeInfo: async () => ({ + results: [{ + clusterId: "hpc01", + activationStatus: ClusterActivationStatus.ACTIVATED, + operatorId: undefined, + operatorName: undefined, + comment: "", + }], + }), + getClusterNodesInfo: async () => ({ + nodeInfo: [{ + gpuCount: 1, + state: 1, + partitions: ["linux","compute"], + cpuCoreCount: 1, + idleGpuCount: 1, + nodeName: "h1", + allocCpuCoreCount: 1, + idleCpuCoreCount: 1, + totalMemMb: 0.23, + allocMemMb: 0.32, + idleMemMb: 0.5, + allocGpuCount: 0.5, + }], + }), }; diff --git a/apps/portal-web/src/apis/api.ts b/apps/portal-web/src/apis/api.ts index 369f2f6e04..feaf15a167 100644 --- a/apps/portal-web/src/apis/api.ts +++ b/apps/portal-web/src/apis/api.ts @@ -27,6 +27,7 @@ import type { AuthCallbackSchema } from "src/pages/api/auth/callback"; import type { LogoutSchema } from "src/pages/api/auth/logout"; import type { ValidateTokenSchema } from "src/pages/api/auth/validateToken"; import type { GetClusterRunningInfoSchema } from "src/pages/api/dashboard/getClusterInfo"; +import type { GetClusterNodesInfoSchema } from "src/pages/api/dashboard/getClusterNodesInfo"; import type { GetQuickEntriesSchema } from "src/pages/api/dashboard/getQuickEntries"; import type { SaveQuickEntriesSchema } from 
"src/pages/api/dashboard/saveQuickEntries"; import type { CreateDesktopSchema } from "src/pages/api/desktop/createDesktop"; @@ -79,6 +80,7 @@ export const api = { validateToken: apiClient.fromTypeboxRoute("GET", "/api/auth/validateToken"), getClusterInfo: apiClient.fromTypeboxRoute("GET", "/api//cluster"), getClusterRunningInfo: apiClient.fromTypeboxRoute("GET", "/api/dashboard/getClusterInfo"), + getClusterNodesInfo: apiClient.fromTypeboxRoute("GET", "/api/dashboard/getClusterNodesInfo"), getQuickEntries: apiClient.fromTypeboxRoute("GET", "/api/dashboard/getQuickEntries"), saveQuickEntries: apiClient.fromTypeboxRoute("POST", "/api/dashboard/saveQuickEntries"), createDesktop: apiClient.fromTypeboxRoute("POST", "/api/desktop/createDesktop"), diff --git a/apps/portal-web/src/i18n/zh_cn.ts b/apps/portal-web/src/i18n/zh_cn.ts index 481c1c1c40..3f94eb3303 100644 --- a/apps/portal-web/src/i18n/zh_cn.ts +++ b/apps/portal-web/src/i18n/zh_cn.ts @@ -449,7 +449,7 @@ export default { resourceInfo:"资源信息", core:"核", running:"运行中", - idle:"可用", + idle:"空闲", notAvailable:"不可用", card:"卡", job:"作业", diff --git a/apps/portal-web/src/pages/api/dashboard/getClusterNodesInfo.ts b/apps/portal-web/src/pages/api/dashboard/getClusterNodesInfo.ts new file mode 100644 index 0000000000..f8aaf85513 --- /dev/null +++ b/apps/portal-web/src/pages/api/dashboard/getClusterNodesInfo.ts @@ -0,0 +1,75 @@ +/** + * Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy + * SCOW is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ */ + +import { typeboxRouteSchema } from "@ddadaal/next-typed-api-routes-runtime"; +import { asyncUnaryCall } from "@ddadaal/tsgrpc-client"; +import { ConfigServiceClient } from "@scow/protos/build/portal/config"; +import { NodeInfo_NodeState } from "@scow/protos/build/portal/config"; +import { Static, Type } from "@sinclair/typebox"; +import { authenticate } from "src/auth/server"; +import { getClient } from "src/utils/client"; +import { route } from "src/utils/route"; + +export const NodeInfo = Type.Object({ + nodeName: Type.String(), + partitions: Type.Array(Type.String()), + state: Type.Enum(NodeInfo_NodeState), + cpuCoreCount: Type.Number(), + allocCpuCoreCount: Type.Number(), + idleCpuCoreCount: Type.Number(), + totalMemMb: Type.Number(), + allocMemMb: Type.Number(), + idleMemMb: Type.Number(), + gpuCount: Type.Number(), + allocGpuCount: Type.Number(), + idleGpuCount: Type.Number(), +}); + +export type NodeInfo = Static; + +export const GetClusterNodesInfoSchema = typeboxRouteSchema({ + method: "GET", + + query: Type.Object({ + nodeNames: Type.Optional(Type.Array(Type.String())), + cluster :Type.String(), + }), + + responses: { + 200: Type.Object({ + nodeInfo: Type.Array(NodeInfo), + }), + + 403: Type.Null(), + }, +}); + +const auth = authenticate(() => true); + +export default route(GetClusterNodesInfoSchema, async (req, res) => { + + const info = await auth(req, res); + + if (!info) { return; } + + const { cluster,nodeNames } = req.query; + + const client = getClient(ConfigServiceClient); + + const reply = await asyncUnaryCall(client, "getClusterNodesInfo", { + cluster, + nodeNames: nodeNames || [], + }); + + return { 200: { nodeInfo: reply.nodes } }; + +}); diff --git a/apps/portal-web/src/pages/dashboard.tsx b/apps/portal-web/src/pages/dashboard.tsx index 50184a0285..2f7e1a2ac6 100644 --- a/apps/portal-web/src/pages/dashboard.tsx +++ b/apps/portal-web/src/pages/dashboard.tsx @@ -11,6 +11,7 @@ */ import { PartitionInfo } from 
"@scow/protos/build/portal/config"; +import { NodeInfo } from "@scow/protos/build/portal/config"; import { NextPage } from "next"; import { useRouter } from "next/router"; import { useCallback, useEffect } from "react"; @@ -27,18 +28,17 @@ import { UserStore } from "src/stores/UserStore"; import { Head } from "src/utils/head"; import { styled } from "styled-components"; - - -interface Props { -} +interface Props {} interface FulfilledResult { - clusterInfo: { clusterName: string, partitions: PartitionInfo[] } + clusterInfo: { clusterName: string; partitions: PartitionInfo[] }; } +interface FulfilledNodesResult { + nodeInfo: { clusterName: string; nodes: NodeInfo[] }; +} export const DashboardPage: NextPage = requireAuth(() => true)(() => { - const userStore = useStore(UserStore); const router = useRouter(); @@ -52,26 +52,54 @@ export const DashboardPage: NextPage = requireAuth(() => true)(() => { const { data, isLoading } = useAsync({ promiseFn: useCallback(async () => { - const rawClusterInfoPromises = currentClusters.map((x) => - api.getClusterRunningInfo({ query: { clusterId: x.id } }) + api + .getClusterRunningInfo({ query: { clusterId: x.id } }) .httpError(500, () => {}), ); + const rawClusterInfoResults = await Promise.allSettled(rawClusterInfoPromises); + const rawClusterNodesInfoPromises = currentClusters.map((x) => + api + .getClusterNodesInfo({ query: { cluster: x.id } }) + .httpError(500, () => {}), + ); - const rawClusterInfoResults = await Promise.allSettled(rawClusterInfoPromises); + const rawClusterNodesInfoResults = await Promise.allSettled(rawClusterNodesInfoPromises); + + const successfulNodesResults = rawClusterNodesInfoResults + .map((result, idx) => { + if (result.status === "fulfilled") { + return { + ...result, + value: { + nodeInfo: { + clusterName: currentClusters[idx].id, + nodes: result.value.nodeInfo, + }, + }, + } as PromiseSettledResult; + } + + return result; + }) + .filter( + (result): result is PromiseFulfilledResult => + 
result.status === "fulfilled", + ) + .map((result) => result.value); - // 处理成功的结果 const successfulResults = rawClusterInfoResults - // 替换clusterId,适配器返回的clusterName和SCOW配置文件中的clusterId没关系 .map((result, idx) => { if (result.status === "fulfilled") { return { ...result, - value:{ - clusterInfo:{ clusterName: currentClusters[idx].id, - partitions:result.value.clusterInfo.partitions }, + value: { + clusterInfo: { + clusterName: currentClusters[idx].id, + partitions: result.value.clusterInfo.partitions, + }, }, } as PromiseSettledResult; } @@ -80,20 +108,34 @@ export const DashboardPage: NextPage = requireAuth(() => true)(() => { }) .filter( (result): result is PromiseFulfilledResult => - result.status === "fulfilled") + result.status === "fulfilled", + ) .map((result) => result.value); - // 处理失败的结果 - const failedClusters = currentClusters.filter((x) => - !successfulResults.find((y) => y.clusterInfo.clusterName === x.id), + const failedClusters = currentClusters.filter( + (x) => !successfulResults.find((y) => y.clusterInfo.clusterName === x.id), ); - // 成功的集群名称 const successfulClusters = currentClusters.filter((x) => successfulResults.find((y) => y.clusterInfo.clusterName === x.id), ); + const nodeCountsByPartition: Record> = {}; + successfulNodesResults.forEach(({ nodeInfo }) => { + nodeInfo.nodes.forEach((node) => { + node.partitions.forEach((partition) => { + if (!nodeCountsByPartition[node.nodeName]) { + nodeCountsByPartition[node.nodeName] = {}; + } + if (!nodeCountsByPartition[node.nodeName][partition]) { + nodeCountsByPartition[node.nodeName][partition] = 0; + } + nodeCountsByPartition[node.nodeName][partition]++; + }); + }); + }); + const clustersInfo = successfulResults .map((cluster) => ({ clusterInfo: { @@ -106,34 +148,35 @@ export const DashboardPage: NextPage = requireAuth(() => true)(() => { clusterName: cluster.clusterInfo.clusterName, ...x, cpuUsage: ((x.runningCpuCount / x.cpuCoreCount) * 100).toFixed(2), - gpuUsage: x.gpuCoreCount ? 
((x.runningGpuCount / x.gpuCoreCount) * 100).toFixed(2) : undefined, + gpuUsage: x.gpuCoreCount + ? ((x.runningGpuCount / x.gpuCoreCount) * 100).toFixed(2) + : undefined, })), ); - // 平台概览信息 const platformOverview: PlatformOverview = { - nodeCount:0, - runningNodeCount:0, - idleNodeCount:0, - notAvailableNodeCount:0, - cpuCoreCount:0, - runningCpuCount:0, - idleCpuCount:0, - notAvailableCpuCount:0, - gpuCoreCount:0, - runningGpuCount:0, - idleGpuCount:0, - notAvailableGpuCount:0, - jobCount:0, - runningJobCount:0, - pendingJobCount:0, - usageRatePercentage:0, - partitionStatus:0, + nodeCount: 0, + runningNodeCount: 0, + idleNodeCount: 0, + notAvailableNodeCount: 0, + cpuCoreCount: 0, + runningCpuCount: 0, + idleCpuCount: 0, + notAvailableCpuCount: 0, + gpuCoreCount: 0, + runningGpuCount: 0, + idleGpuCount: 0, + notAvailableGpuCount: 0, + jobCount: 0, + runningJobCount: 0, + pendingJobCount: 0, + usageRatePercentage: 0, + partitionStatus: 0, }; - // 各个集群概览信息 const clustersOverview: ClusterOverview[] = []; successfulResults.forEach((result) => { + const { clusterName, partitions } = result.clusterInfo; const aggregatedData = partitions.reduce( @@ -178,7 +221,47 @@ export const DashboardPage: NextPage = requireAuth(() => true)(() => { }, ); - // 累加平台概览信息 + + // 真实的节点数 + const realNode = successfulNodesResults. + find((v) => v.nodeInfo.clusterName === clusterName)?.nodeInfo.nodes; + + if (realNode) { + aggregatedData.runningNodeCount = realNode.filter((v) => v.state === 2).length; + aggregatedData.notAvailableNodeCount = realNode.filter((v) => v.state === 3).length; + aggregatedData.idleNodeCount = realNode.filter((v) => v.state === 1).length; + } + + if (realNode && ((realNode?.length ?? 
-1) < aggregatedData.nodeCount)) { + aggregatedData.nodeCount = realNode.length; + const duplicateNodes: NodeInfo[] = []; + // 找到被重复计算的节点 + Object.keys(nodeCountsByPartition).forEach((nodeName) => { + const nodeCountInPartitions = nodeCountsByPartition[nodeName]; + if (Object.keys(nodeCountInPartitions).length > 1) { + const duplicateNode = successfulNodesResults.find((v) => v.nodeInfo.clusterName === clusterName) + ?.nodeInfo.nodes.find((v) => v.nodeName === nodeName); + if (duplicateNode) { + duplicateNodes.push(duplicateNode); + } + } + }); + // 去除被重复计算的节点 + duplicateNodes.forEach((duplicateNode) => { + const count = duplicateNode.partitions.length - 1; + aggregatedData.cpuCoreCount -= count * (duplicateNode?.cpuCoreCount ?? 0); + aggregatedData.runningCpuCount -= count * (duplicateNode?.allocCpuCoreCount ?? 0); + aggregatedData.idleCpuCount -= count * (duplicateNode?.idleCpuCoreCount ?? 0); + aggregatedData.gpuCoreCount -= count * (duplicateNode?.gpuCount ?? 0); + aggregatedData.runningGpuCount -= count * (duplicateNode?.allocGpuCount ?? 0); + aggregatedData.idleGpuCount -= count * (duplicateNode?.idleGpuCount ?? 
0); + aggregatedData.notAvailableCpuCount -= aggregatedData.cpuCoreCount - + (aggregatedData.runningCpuCount + aggregatedData.idleCpuCount); + aggregatedData.notAvailableGpuCount -= aggregatedData.gpuCoreCount - + (aggregatedData.runningGpuCount + aggregatedData.idleGpuCount); + }); + } + platformOverview.nodeCount += aggregatedData.nodeCount; platformOverview.runningNodeCount += aggregatedData.runningNodeCount; platformOverview.idleNodeCount += aggregatedData.idleNodeCount; @@ -196,14 +279,16 @@ export const DashboardPage: NextPage = requireAuth(() => true)(() => { platformOverview.pendingJobCount += aggregatedData.pendingJobCount; platformOverview.partitionStatus += aggregatedData.partitionStatus; - aggregatedData.usageRatePercentage = - Number(((aggregatedData.runningNodeCount / aggregatedData.nodeCount) * 100).toFixed(2)); + aggregatedData.usageRatePercentage = Number( + ((aggregatedData.runningNodeCount / aggregatedData.nodeCount) * 100).toFixed(2), + ); clustersOverview.push(aggregatedData); }); - platformOverview.usageRatePercentage = - Number(((platformOverview.runningNodeCount / platformOverview.nodeCount) * 100).toFixed(2)); + platformOverview.usageRatePercentage = Number( + ((platformOverview.runningNodeCount / platformOverview.nodeCount) * 100).toFixed(2), + ); return { clustersInfo, @@ -218,21 +303,23 @@ export const DashboardPage: NextPage = requireAuth(() => true)(() => { return ( - + ({ ...item, id:idx })) : []} + clusterInfo={data?.clustersInfo ? data.clustersInfo.map((item, idx) => ({ ...item, id: idx })) : []} failedClusters={data?.failedClusters ?? []} currentClusters={currentClusters} clustersOverview={data?.clustersOverview ?? 
[]} - platformOverview={data?.platformOverview } + platformOverview={data?.platformOverview} successfulClusters={data?.successfulClusters} /> ); }); -const DashboardPageContent = styled.div` -`; +const DashboardPageContent = styled.div``; export default DashboardPage; diff --git a/dev/test-adapter/src/services/config.ts b/dev/test-adapter/src/services/config.ts index 8dacf4cccc..5a967cc116 100644 --- a/dev/test-adapter/src/services/config.ts +++ b/dev/test-adapter/src/services/config.ts @@ -54,5 +54,8 @@ export const configServiceServer = plugin((server) => { getClusterInfo:async () => { return []; }, + getClusterNodesInfo:async () => { + return []; + }, }); }); diff --git a/libs/protos/scheduler-adapter/package.json b/libs/protos/scheduler-adapter/package.json index dc09f8fe56..97a3d1c8f9 100644 --- a/libs/protos/scheduler-adapter/package.json +++ b/libs/protos/scheduler-adapter/package.json @@ -5,7 +5,7 @@ "main": "build/index.js", "private": true, "scripts": { - "generate": "rimraf generated && buf generate --template buf.gen.yaml https://github.com/PKUHPC/scow-scheduler-adapter-interface.git#branch=v1.5.0", + "generate": "rimraf generated && buf generate --template buf.gen.yaml https://github.com/PKUHPC/scow-scheduler-adapter-interface.git#branch=v1.6.0", "build": "rimraf build && tsc" }, "files": [ diff --git a/protos/portal/config.proto b/protos/portal/config.proto index 34011bfb59..3459a9a669 100644 --- a/protos/portal/config.proto +++ b/protos/portal/config.proto @@ -52,6 +52,43 @@ message GetClusterInfoResponse { repeated PartitionInfo partitions = 2; } +message NodeInfo { + + enum NodeState { + UNKNOWN = 0; + IDLE = 1; + RUNNING = 2; + NOT_AVAILABLE = 3; + } + + string node_name = 1; + repeated string partitions = 2; + NodeState state = 3; + uint32 cpu_core_count = 4; + uint32 alloc_cpu_core_count = 5; + uint32 idle_cpu_core_count = 6; + uint32 total_mem_mb = 7; + uint32 alloc_mem_mb = 8; + uint32 idle_mem_mb = 9; + uint32 gpu_count = 10; + uint32 
alloc_gpu_count = 11; + uint32 idle_gpu_count = 12; +} + + +message GetClusterNodesInfoRequest { + // Note on node_names: if it is empty ([]), info for all nodes of the cluster is requested. + string cluster = 1; + repeated string node_names = 2; +} + +message GetClusterNodesInfoResponse { + repeated NodeInfo nodes = 1; +} + service ConfigService { + rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse); + + rpc GetClusterNodesInfo(GetClusterNodesInfoRequest) returns (GetClusterNodesInfoResponse); }