Skip to content

Commit

Permalink
Merge pull request #1704 from maryamtahhan/cleanup-gpu
Browse files Browse the repository at this point in the history
chore: gpu cleanup
  • Loading branch information
sthaha authored Oct 28, 2024
2 parents 0f5c40e + bb1b4ef commit 2c42b18
Show file tree
Hide file tree
Showing 24 changed files with 489 additions and 450 deletions.
8 changes: 1 addition & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,8 @@ CTR_CMD_PUSH_OPTIONS ?=

GENERAL_TAGS := include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo
GPU_TAGS :=
ifeq ($(shell ldconfig -p | grep -q libnvml_injection.so && echo exists),exists)
GPU_TAGS := nvml
endif
ifeq ($(shell ldconfig -p | grep -q libdcgm.so && echo exists),exists)
GPU_TAGS := dcgm
endif
ifeq ($(shell ldconfig -p | grep -q libhlml.so && echo exists),exists)
GPU_TAGS := habana
GPU_TAGS := habana
endif

# set GOENV
Expand Down
2 changes: 1 addition & 1 deletion cmd/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func main() {

if config.EnabledGPU() {
r := accelerator.GetRegistry()
if a, err := accelerator.New(accelerator.GPU, true); err == nil {
if a, err := accelerator.New(config.GPU, true); err == nil {
r.MustRegister(a) // Register the accelerator with the registry
} else {
klog.Errorf("failed to init GPU accelerators: %v", err)
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/energy/node_energy_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func UpdateNodeComponentsEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup)
func UpdateNodeGPUEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) {
defer wg.Done()
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
gpuEnergy := gpu.Device().AbsEnergyFromDevice()
for gpu, energy := range gpuEnergy {
nodeStats.EnergyUsage[config.AbsEnergyInGPU].SetDeltaStat(fmt.Sprintf("%d", gpu), uint64(energy))
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/metric_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ func (c *Collector) updateProcessResourceUtilizationMetrics(wg *sync.WaitGroup)
// first update the BPF metrics, which is responsible for adding new processes to the ProcessStats collection
resourceBpf.UpdateProcessBPFMetrics(c.bpfExporter, c.ProcessStats)
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
accelerator.UpdateProcessGPUUtilizationMetrics(c.ProcessStats)
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/metric_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
)

func newMockCollector(mockAttacher bpf.Exporter) *Collector {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
d := gpu.Device()
err := d.Init() // create structure instances that will be accessed to create a containerMetric
Expect(err).NotTo(HaveOccurred())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
"github.com/sustainable-computing-io/kepler/pkg/config"
"github.com/sustainable-computing-io/kepler/pkg/libvirt"
acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator"
dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device"
dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices"
"k8s.io/klog/v2"

"github.com/sustainable-computing-io/kepler/pkg/utils"
Expand All @@ -43,7 +43,7 @@ var (

// UpdateProcessGPUUtilizationMetrics reads the GPU metrics of each process using the GPU
func UpdateProcessGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats) {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
d := gpu.Device()
migDevices := d.DeviceInstances()
for _, _device := range d.DevicesByID() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/node_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (ne *NodeStats) ResetDeltaValues() {
func (ne *NodeStats) UpdateIdleEnergyWithMinValue(isComponentsSystemCollectionSupported bool) {
// gpu metric
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUComputeUtilization)
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/collector/stats/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func NewStats() *Stats {
}

if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
stats.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection()
stats.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection()
stats.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection()
Expand Down Expand Up @@ -141,7 +141,7 @@ func (s *Stats) UpdateDynEnergy() {
}
// GPU metric
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
for gpuID := range s.EnergyUsage[config.AbsEnergyInGPU] {
s.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/test_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const (
// TODO: do not use a fixed usageMetric array in the power models; structured data is more desirable.
func SetMockedCollectorMetrics() {
config.GetConfig()
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
err := gpu.Device().Init() // create structure instances that will be accessed to create a processMetric
klog.Fatalln(err)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func GetProcessFeatureNames() []string {

// gpu metric
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
gpuMetrics := []string{config.GPUComputeUtilization, config.GPUMemUtilization}
metrics = append(metrics, gpuMetrics...)
klog.V(3).Infof("Available GPU metrics: %v", gpuMetrics)
Expand Down
4 changes: 2 additions & 2 deletions pkg/metrics/metricfactory/metric_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func EnergyMetricsPromDesc(context string) (descriptions map[string]*prometheus.
// set the default source to trained power model
source := modeltypes.TrainedPowerModelSource
if strings.Contains(name, config.GPU) {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
source = gpu.Device().Name()
}
} else if strings.Contains(name, config.PLATFORM) && platform.IsSystemCollectionSupported() {
Expand Down Expand Up @@ -87,7 +87,7 @@ func SCMetricsPromDesc(context string, bpfSupportedMetrics bpf.SupportedMetrics)
func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) {
descriptions = make(map[string]*prometheus.Desc)
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
for _, name := range consts.GPUMetricNames {
descriptions[name] = resMetricsPromDesc(context, name, gpu.Device().Name())
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/metrics/prometheus_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/sustainable-computing-io/kepler/pkg/bpf"
"github.com/sustainable-computing-io/kepler/pkg/collector"
"github.com/sustainable-computing-io/kepler/pkg/collector/stats"
"github.com/sustainable-computing-io/kepler/pkg/config"
"github.com/sustainable-computing-io/kepler/pkg/model"

acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator"
Expand Down Expand Up @@ -63,7 +64,7 @@ var _ = Describe("Test Prometheus Collector Unit", func() {
// we need to disable the system real time power metrics for testing since we add mock values or use power model estimator
components.SetIsSystemCollectionSupported(false)
platform.SetIsSystemCollectionSupported(false)
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
err := gpu.Device().Init() // create structure instances that will be accessed to create a containerMetric
Expect(err).NotTo(HaveOccurred())
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/metrics/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
for _, collectorName := range consts.GPUMetricNames {
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/model/process_energy.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta
}
// estimate the associated power consumption of GPU for each process
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
processGPUPower, errGPU = processComponentPowerModel.GetGPUPower(isIdlePower)
if errGPU != nil {
klog.V(5).Infoln("Could not estimate the Process GPU Power")
Expand Down
Loading

0 comments on commit 2c42b18

Please sign in to comment.