From cf469c262545cf8dcf9307d9a2dfec2474772401 Mon Sep 17 00:00:00 2001
From: Huamin Chen
Date: Thu, 5 Dec 2024 19:13:51 -0500
Subject: [PATCH] feat(sensor): support NVIDIA Grace Hopper

Signed-off-by: Huamin Chen
---
 pkg/sensors/accelerator/devices/device.go     |   4 +-
 pkg/sensors/accelerator/devices/grace_acpi.go | 274 ++++++++++++++++++
 pkg/sensors/components/power.go               |   7 +
 pkg/sensors/components/source/grace_acpi.go   | 253 ++++++++++++++++
 4 files changed, 537 insertions(+), 1 deletion(-)
 create mode 100644 pkg/sensors/accelerator/devices/grace_acpi.go
 create mode 100644 pkg/sensors/components/source/grace_acpi.go

diff --git a/pkg/sensors/accelerator/devices/device.go b/pkg/sensors/accelerator/devices/device.go
index e3b6f84a35..e1e0057747 100644
--- a/pkg/sensors/accelerator/devices/device.go
+++ b/pkg/sensors/accelerator/devices/device.go
@@ -30,6 +30,7 @@ const (
 	HABANA
 	DCGM
 	NVML
+	GRACE
 )
 
 var (
@@ -46,7 +47,7 @@ type (
 )
 
 func (d DeviceType) String() string {
-	return [...]string{"MOCK", "HABANA", "DCGM", "NVML"}[d]
+	return [...]string{"MOCK", "HABANA", "DCGM", "NVML", "GRACE HOPPER"}[d]
 }
 
 type Device interface {
@@ -110,6 +111,7 @@ func registerDevices(r *Registry) {
 	dcgmCheck(r)
 	habanaCheck(r)
 	nvmlCheck(r)
+	graceCheck(r)
 }
 
 func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStartupFunc) {
diff --git a/pkg/sensors/accelerator/devices/grace_acpi.go b/pkg/sensors/accelerator/devices/grace_acpi.go
new file mode 100644
index 0000000000..cc41cd599b
--- /dev/null
+++ b/pkg/sensors/accelerator/devices/grace_acpi.go
@@ -0,0 +1,274 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package devices
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/sustainable-computing-io/kepler/pkg/config"
+	"k8s.io/klog/v2"
+)
+
+const (
+	// Grace ACPI power paths and identifiers
+	graceHwmonPathTemplate = "/sys/class/hwmon/hwmon*/"
+	graceDevicePath        = "device/"
+	gracePowerPrefix       = "power1"
+	graceOemInfoFile       = "_oem_info"
+	graceAverageFile       = "_average"
+
+	// Grace Hopper module power identifier
+	graceModuleLabel = "Module Power Socket" // Total CG1 module power (GPU+HBM)
+
+	// Constants
+	microWattToMilliJoule = 1000 // Divides microwatt-seconds (microjoules) to get millijoules
+	graceHwType           = config.GPU
+)
+
+var (
+	graceAccImpl = gpuGraceACPI{}
+	graceType    DeviceType
+)
+
+// gpuGraceACPI reports Grace Hopper module energy, derived from the average
+// power that the hwmon sensors expose.
+type gpuGraceACPI struct {
+	collectionSupported bool
+	modulePowerPaths    map[int]string       // Module power paths indexed by socket
+	lastRead            map[string]time.Time // Time of the previous sample of each power file
+}
+
+// graceCheck registers the Grace Hopper device with r, but only when InitLib
+// finds module power sensors on this host.
+func graceCheck(r *Registry) {
+	if err := graceAccImpl.InitLib(); err != nil {
+		klog.V(5).Infof("Error initializing Grace GPU: %v", err)
+		return
+	}
+	graceType = GRACE
+	if err := addDeviceInterface(r, graceType, graceHwType, graceDeviceStartup); err != nil {
+		klog.V(5).Infof("Error registering Grace GPU: %v", err)
+		return
+	}
+	klog.Infof("Using %s to obtain Grace GPU power", graceAccImpl.Name())
+}
+
+// graceDeviceStartup initializes the shared device and returns it, or nil on failure.
+func graceDeviceStartup() Device {
+	if err := graceAccImpl.Init(); err != nil {
+		klog.Errorf("failed to init Grace GPU device: %v", err)
+		return nil
+	}
+	return &graceAccImpl
+}
+
+// findModulePowerPaths scans the hwmon devices and records, per socket, the
+// power1_average file whose power1_oem_info label starts with graceModuleLabel.
+// Finding no such file is not an error; the map is simply left empty.
+func (g *gpuGraceACPI) findModulePowerPaths() error {
+	g.modulePowerPaths = make(map[int]string)
+
+	hwmonDirs, err := filepath.Glob(graceHwmonPathTemplate)
+	if err != nil {
+		return fmt.Errorf("failed to find hwmon directories: %w", err)
+	}
+
+	for _, hwmonDir := range hwmonDirs {
+		deviceDir := hwmonDir + graceDevicePath
+		oemFile := deviceDir + gracePowerPrefix + graceOemInfoFile
+		data, err := os.ReadFile(oemFile)
+		if err != nil {
+			continue
+		}
+		label := strings.TrimSpace(string(data))
+
+		if !strings.HasPrefix(label, graceModuleLabel) {
+			continue
+		}
+
+		var socketNum int
+		switch {
+		case strings.HasSuffix(label, "Socket 0"):
+			socketNum = 0
+		case strings.HasSuffix(label, "Socket 1"):
+			socketNum = 1
+		default:
+			continue
+		}
+
+		avgFile := deviceDir + gracePowerPrefix + graceAverageFile
+		g.modulePowerPaths[socketNum] = avgFile
+	}
+
+	return nil
+}
+
+// readPowerFile reads the power, in microwatts, from the hwmon file at path and
+// returns the energy in mJ used since that same file was last read. The first
+// read of a file only records the time and returns 0.
+func (g *gpuGraceACPI) readPowerFile(path string) (uint64, error) {
+	if path == "" {
+		return 0, fmt.Errorf("power path not initialized")
+	}
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read power file %s: %w", path, err)
+	}
+
+	power, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
+	if err != nil {
+		return 0, fmt.Errorf("failed to parse power value: %w", err)
+	}
+
+	// Keep one timestamp per file: with a single shared timestamp, every file
+	// read after the first one in a pass saw a near-zero interval.
+	now := time.Now()
+	prev, sampled := g.lastRead[path]
+	if g.lastRead == nil {
+		g.lastRead = make(map[string]time.Time)
+	}
+	g.lastRead[path] = now
+	if !sampled {
+		return 0, nil
+	}
+
+	seconds := now.Sub(prev).Seconds()
+	return uint64(float64(power) * seconds / microWattToMilliJoule), nil
+}
+
+// Name returns the string form of the registered device type.
+func (g *gpuGraceACPI) Name() string {
+	return graceType.String()
+}
+
+// DevType returns the registered device type.
+func (g *gpuGraceACPI) DevType() DeviceType {
+	return graceType
+}
+
+// HwType returns the hardware class of this device (config.GPU).
+func (g *gpuGraceACPI) HwType() string {
+	return graceHwType
+}
+
+// InitLib probes hwmon for Grace Hopper module power sensors and fails when
+// none is found, so that the device is never registered on other hosts.
+func (g *gpuGraceACPI) InitLib() error {
+	if err := g.findModulePowerPaths(); err != nil {
+		return err
+	}
+	if len(g.modulePowerPaths) == 0 {
+		return fmt.Errorf("no Grace Hopper module power sensors found")
+	}
+	return nil
+}
+
+// Init rescans the module power files and records whether collection is possible.
+func (g *gpuGraceACPI) Init() error {
+	if err := g.findModulePowerPaths(); err != nil {
+		return err
+	}
+	g.collectionSupported = len(g.modulePowerPaths) > 0
+	if g.collectionSupported {
+		klog.V(4).Infof("Detected Grace Hopper system with %d GPUs", len(g.modulePowerPaths))
+	}
+	return nil
+}
+
+// IsDeviceCollectionSupported reports whether module power files were found.
+func (g *gpuGraceACPI) IsDeviceCollectionSupported() bool {
+	return g.collectionSupported
+}
+
+// SetDeviceCollectionSupported overrides the collection-supported flag.
+func (g *gpuGraceACPI) SetDeviceCollectionSupported(supported bool) {
+	g.collectionSupported = supported
+}
+
+// AbsEnergyFromDevice returns, indexed by socket number, the energy in mJ
+// used since the previous call (truncated to uint32). A socket that is
+// missing or cannot be read reports 0; with no sockets the result is nil.
+func (g *gpuGraceACPI) AbsEnergyFromDevice() []uint32 {
+	if len(g.modulePowerPaths) == 0 {
+		return nil
+	}
+
+	// Size the result by the highest socket number, not by the map length, so
+	// a host exposing only socket 1 is still reported at index 1.
+	maxSocket := 0
+	for socketNum := range g.modulePowerPaths {
+		if socketNum > maxSocket {
+			maxSocket = socketNum
+		}
+	}
+
+	energies := make([]uint32, maxSocket+1)
+	for socketNum, path := range g.modulePowerPaths {
+		energy, err := g.readPowerFile(path)
+		if err != nil {
+			klog.V(3).Infof("Failed to read GPU power for socket %d: %v", socketNum, err)
+			continue
+		}
+		energies[socketNum] = uint32(energy)
+	}
+	return energies
+}
+
+// DevicesByID returns one GPUDevice per detected socket, keyed by socket number.
+func (g *gpuGraceACPI) DevicesByID() map[int]any {
+	devs := make(map[int]any)
+	for socketNum := range g.modulePowerPaths {
+		devs[socketNum] = GPUDevice{
+			ID:          socketNum,
+			IsSubdevice: false,
+		}
+	}
+	return devs
+}
+
+// DevicesByName always returns an empty map.
+func (g *gpuGraceACPI) DevicesByName() map[string]any {
+	return make(map[string]any)
+}
+
+// DeviceInstances always returns an empty map.
+func (g *gpuGraceACPI) DeviceInstances() map[int]map[int]any {
+	return make(map[int]map[int]any)
+}
+
+// DeviceUtilizationStats always returns an empty map and a nil error.
+func (g *gpuGraceACPI) DeviceUtilizationStats(dev any) (map[any]any, error) {
+	return make(map[any]any), nil
+}
+
+// ProcessResourceUtilizationPerDevice always returns an empty map and a nil error.
+func (g *gpuGraceACPI) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) {
+	// Grace Hopper doesn't provide per-process GPU utilization through ACPI
+	return make(map[uint32]any), nil
+}
+
+// Shutdown forgets the sampling history, so the next read of each file returns 0.
+func (g *gpuGraceACPI) Shutdown() bool {
+	g.lastRead = nil
+	return true
+}
diff --git a/pkg/sensors/components/power.go b/pkg/sensors/components/power.go
index ad7932a0f2..28fe30b419 100644
--- a/pkg/sensors/components/power.go
+++ b/pkg/sensors/components/power.go
@@ -75,6 +75,13 @@ func InitPowerImpl() {
 		return
 	}
 
+	graceACPIImpl := &source.GraceACPI{}
+	if graceACPIImpl.IsSystemCollectionSupported() {
+		klog.V(1).Infoln("use NVIDIA Grace ACPI to obtain power")
+		powerImpl = graceACPIImpl
+		return
+	}
+
 	klog.V(1).Infoln("Unable to obtain power, use estimate method")
 	estimateImpl := &source.PowerEstimate{}
 	powerImpl = estimateImpl
diff --git a/pkg/sensors/components/source/grace_acpi.go b/pkg/sensors/components/source/grace_acpi.go
new file mode 100644
index 0000000000..f3d7d1b83b
--- /dev/null
+++ b/pkg/sensors/components/source/grace_acpi.go
@@ -0,0 +1,253 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package source
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"k8s.io/klog/v2"
+)
+
+// Per https://docs.nvidia.com/grace-perf-tuning-guide/index.html#power-and-thermal-management
+const (
+	// Grace ACPI power paths and identifiers
+	graceHwmonPathTemplate = "/sys/class/hwmon/hwmon*/"
+	graceDevicePath        = "device/"
+	gracePowerPrefix       = "power1"
+	graceOemInfoFile       = "_oem_info"
+	graceAverageFile       = "_average"
+
+	// Grace power component identifiers from hwmon
+	gracePowerLabel = "Grace Power Socket" // Total socket power including DRAM
+	graceCPULabel   = "CPU Power Socket"   // CPU rail power
+	graceSysIOLabel = "SysIO Power Socket" // SOC rail power
+
+	// Conversion factors
+	microWattToMilliJoule = 1000 // Divides microwatt-seconds (microjoules) to get millijoules
+)
+
+// socketPowerPaths holds the hwmon power1_average files found for one socket.
+type socketPowerPaths struct {
+	totalPowerPath string // Grace Power Socket path
+	cpuPowerPath   string // CPU Power Socket path
+}
+
+// GraceACPI derives CPU energy from the average power that NVIDIA Grace
+// exposes through hwmon sensors.
+type GraceACPI struct {
+	sockets  map[int]*socketPowerPaths // Power paths per socket
+	lastRead map[string]time.Time      // Time of the previous sample of each power file
+}
+
+// GetName returns the identifier of this power source.
+func (GraceACPI) GetName() string {
+	return "grace-acpi"
+}
+
+// findPowerPathsByLabel scans the hwmon devices and records, per socket, the
+// power1_average files whose power1_oem_info label identifies them as total
+// socket power or CPU rail power. It fails when no such file exists.
+func (g *GraceACPI) findPowerPathsByLabel() error {
+	g.sockets = make(map[int]*socketPowerPaths)
+
+	hwmonDirs, err := filepath.Glob(graceHwmonPathTemplate)
+	if err != nil {
+		return fmt.Errorf("failed to find hwmon directories: %w", err)
+	}
+
+	for _, hwmonDir := range hwmonDirs {
+		deviceDir := hwmonDir + graceDevicePath
+
+		// Check for power OEM info file
+		oemFile := deviceDir + gracePowerPrefix + graceOemInfoFile
+		data, err := os.ReadFile(oemFile)
+		if err != nil {
+			continue
+		}
+		label := strings.TrimSpace(string(data))
+
+		// Only labels this source reads may create a socket entry; otherwise a
+		// host exposing just module or SysIO sensors would look supported while
+		// every read fails on an empty path.
+		isTotal := strings.HasPrefix(label, gracePowerLabel)
+		isCPU := strings.HasPrefix(label, graceCPULabel)
+		if !isTotal && !isCPU {
+			continue
+		}
+
+		// Extract socket number
+		// Per docs, Grace has 2 sockets, Grace Hopper has 1 CPU and 1 GPU socket
+		var socketNum int
+		switch {
+		case strings.HasSuffix(label, "Socket 0"):
+			socketNum = 0
+		case strings.HasSuffix(label, "Socket 1"):
+			socketNum = 1
+		default:
+			continue
+		}
+
+		// Initialize socket power paths if not exists
+		paths := g.sockets[socketNum]
+		if paths == nil {
+			paths = &socketPowerPaths{}
+			g.sockets[socketNum] = paths
+		}
+
+		// Store the power measurement path based on label type
+		avgFile := deviceDir + gracePowerPrefix + graceAverageFile
+		if isTotal {
+			paths.totalPowerPath = avgFile
+		} else {
+			paths.cpuPowerPath = avgFile
+		}
+	}
+
+	if len(g.sockets) == 0 {
+		return fmt.Errorf("no Grace power measurement files found")
+	}
+
+	klog.V(4).Infof("Detected Grace system with %d sockets", len(g.sockets))
+	return nil
+}
+
+// readPowerFile reads the power from the hwmon file at path and returns the
+// energy in mJ used since that same file was last read. The first read of a
+// file only records the time and returns 0.
+func (g *GraceACPI) readPowerFile(path string) (uint64, error) {
+	if path == "" {
+		return 0, fmt.Errorf("power path not initialized")
+	}
+
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read power file %s: %w", path, err)
+	}
+
+	// Power values are in microWatts
+	power, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
+	if err != nil {
+		return 0, fmt.Errorf("failed to parse power value: %w", err)
+	}
+
+	// Keep one timestamp per file: with a single shared timestamp, every file
+	// read after the first one in a pass (a second socket, or the CPU rail
+	// after the total-power read) saw a near-zero interval.
+	now := time.Now()
+	prev, sampled := g.lastRead[path]
+	if g.lastRead == nil {
+		g.lastRead = make(map[string]time.Time)
+	}
+	g.lastRead[path] = now
+	if !sampled {
+		return 0, nil
+	}
+
+	// Convert power to energy
+	seconds := now.Sub(prev).Seconds()
+	return uint64(float64(power) * seconds / microWattToMilliJoule), nil
+}
+
+// Init discovers the per-socket power files.
+func (g *GraceACPI) Init() error {
+	return g.findPowerPathsByLabel()
+}
+
+// IsSystemCollectionSupported runs Init and reports whether any Grace or CPU
+// power files were found.
+func (g *GraceACPI) IsSystemCollectionSupported() bool {
+	if err := g.Init(); err != nil {
+		klog.V(3).Infof("Grace ACPI power collection not supported: %v", err)
+		return false
+	}
+	return true
+}
+
+// GetAbsEnergyFromCore returns the CPU rail energy in mJ used since the
+// previous read, summed over all sockets. Unreadable sockets are skipped.
+func (g *GraceACPI) GetAbsEnergyFromCore() (uint64, error) {
+	var totalEnergy uint64
+	for socketNum, paths := range g.sockets {
+		energy, err := g.readPowerFile(paths.cpuPowerPath)
+		if err != nil {
+			klog.V(3).Infof("Failed to read CPU power for socket %d: %v", socketNum, err)
+			continue
+		}
+		totalEnergy += energy
+	}
+	return totalEnergy, nil
+}
+
+// GetAbsEnergyFromDram always returns 0.
+func (g *GraceACPI) GetAbsEnergyFromDram() (uint64, error) {
+	// DRAM power is included in total socket power but not separately measured
+	return 0, nil
+}
+
+// GetAbsEnergyFromUncore always returns 0.
+func (g *GraceACPI) GetAbsEnergyFromUncore() (uint64, error) {
+	return 0, nil
+}
+
+// GetAbsEnergyFromPackage returns the total socket energy in mJ used since the
+// previous read, summed over all sockets. Unreadable sockets are skipped.
+func (g *GraceACPI) GetAbsEnergyFromPackage() (uint64, error) {
+	var totalEnergy uint64
+	for socketNum, paths := range g.sockets {
+		energy, err := g.readPowerFile(paths.totalPowerPath)
+		if err != nil {
+			klog.V(3).Infof("Failed to read total power for socket %d: %v", socketNum, err)
+			continue
+		}
+		totalEnergy += energy
+	}
+	return totalEnergy, nil
+}
+
+// GetAbsEnergyFromNodeComponents returns, per socket, the package and core energy
+// in mJ since the previous read. A value that cannot be read is reported as 0.
+func (g *GraceACPI) GetAbsEnergyFromNodeComponents() map[int]NodeComponentsEnergy {
+	componentsEnergies := make(map[int]NodeComponentsEnergy)
+
+	for socketNum, paths := range g.sockets {
+		pkgEnergy, err := g.readPowerFile(paths.totalPowerPath)
+		if err != nil {
+			klog.V(3).Infof("Failed to read total power for socket %d: %v", socketNum, err)
+		}
+		coreEnergy, err := g.readPowerFile(paths.cpuPowerPath)
+		if err != nil {
+			klog.V(3).Infof("Failed to read CPU power for socket %d: %v", socketNum, err)
+		}
+
+		componentsEnergies[socketNum] = NodeComponentsEnergy{
+			Core: coreEnergy,
+			Pkg:  pkgEnergy,
+			// DRAM is included in package power
+		}
+	}
+	return componentsEnergies
+}
+
+// StopPower forgets the sampling history, so the next read of each file returns 0.
+func (g *GraceACPI) StopPower() {
+	g.lastRead = nil
+}