Skip to content

Commit

Permalink
Add GPU metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
ioppermann committed Oct 30, 2024
1 parent dd8906e commit ed5357c
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
1 change: 1 addition & 0 deletions app/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,7 @@ func (a *api) start(ctx context.Context) error {
metrics.Register(monitor.NewUptimeCollector())
metrics.Register(monitor.NewCPUCollector(a.resources))
metrics.Register(monitor.NewMemCollector(a.resources))
metrics.Register(monitor.NewGPUCollector(a.resources))
metrics.Register(monitor.NewNetCollector(a.resources))
metrics.Register(monitor.NewDiskCollector(a.diskfs.Metadata("base"), a.resources))
metrics.Register(monitor.NewFilesystemCollector("diskfs", a.diskfs))
Expand Down
79 changes: 79 additions & 0 deletions monitor/gpu.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package monitor

import (
"fmt"

"github.com/datarhei/core/v16/monitor/metric"
"github.com/datarhei/core/v16/resources"
)

type gpuCollector struct {
ngpuDescr *metric.Description
usageDescr *metric.Description
encoderDescr *metric.Description
decoderDescr *metric.Description
memoryTotalDescr *metric.Description
memoryFreeDescr *metric.Description
memoryLimitDescr *metric.Description
limitDescr *metric.Description

resources resources.Resources
}

// NewGPUCollector returns a metric.Collector for GPU metrics that reads its
// values from rsc. All metric descriptions are created up front; every metric
// except "gpu_ngpu" carries a single "index" label.
func NewGPUCollector(rsc resources.Resources) metric.Collector {
	return &gpuCollector{
		resources: rsc,

		ngpuDescr:        metric.NewDesc("gpu_ngpu", "Number of GPUs in the system", nil),
		usageDescr:       metric.NewDesc("gpu_usage", "Percentage of GPU used ", []string{"index"}),
		encoderDescr:     metric.NewDesc("gpu_encoder", "Percentage of GPU encoder used", []string{"index"}),
		decoderDescr:     metric.NewDesc("gpu_decoder", "Percentage of GPU decoder used", []string{"index"}),
		memoryTotalDescr: metric.NewDesc("gpu_mem_total", "GPU memory total in bytes", []string{"index"}),
		memoryFreeDescr:  metric.NewDesc("gpu_mem_free", "GPU memory available in bytes", []string{"index"}),
		memoryLimitDescr: metric.NewDesc("gpu_mem_limit", "GPU memory limit in bytes", []string{"index"}),
		limitDescr:       metric.NewDesc("gpu_limit", "Percentage of GPU to be consumed", []string{"index"}),
	}
}

// Stop is a no-op: this method releases nothing.
// NOTE(review): presumably present to satisfy metric.Collector — confirm
// against the interface definition.
func (c *gpuCollector) Stop() {}

// Prefix returns "gpu", matching the "gpu_" prefix shared by the names of all
// metrics this collector describes.
//
// It previously returned "cpu", a copy-paste leftover that mislabeled this
// collector as the CPU collector.
func (c *gpuCollector) Prefix() string {
	return "gpu"
}

// Describe returns the descriptions of all metrics this collector can emit:
// the GPU count followed by the per-GPU utilization, memory and limit metrics.
func (c *gpuCollector) Describe() []*metric.Description {
	descs := make([]*metric.Description, 0, 8)

	descs = append(descs, c.ngpuDescr)
	descs = append(descs, c.usageDescr, c.encoderDescr, c.decoderDescr)
	descs = append(descs, c.memoryTotalDescr, c.memoryFreeDescr, c.memoryLimitDescr)
	descs = append(descs, c.limitDescr)

	return descs
}

// Collect reads the current resource information and returns the GPU metrics
// derived from it: one unlabeled value for the number of GPUs and, for every
// entry in the GPU list, a set of values labeled with that entry's position.
func (c *gpuCollector) Collect() metric.Metrics {
	info := c.resources.Info()

	out := metric.NewMetrics()
	out.Add(metric.NewValue(c.ngpuDescr, info.GPU.NGPU))

	for pos, dev := range info.GPU.GPU {
		// The "index" label value is the position of the GPU in the list.
		label := fmt.Sprintf("%d", pos)

		// Utilization figures and the usage limit are passed through as-is.
		out.Add(metric.NewValue(c.usageDescr, dev.Usage, label))
		out.Add(metric.NewValue(c.encoderDescr, dev.Encoder, label))
		out.Add(metric.NewValue(c.decoderDescr, dev.Decoder, label))
		out.Add(metric.NewValue(c.limitDescr, dev.UsageLimit, label))

		// Memory figures are converted to float64 before being recorded.
		out.Add(metric.NewValue(c.memoryTotalDescr, float64(dev.MemoryTotal), label))
		out.Add(metric.NewValue(c.memoryFreeDescr, float64(dev.MemoryAvailable), label))
		out.Add(metric.NewValue(c.memoryLimitDescr, float64(dev.MemoryLimit), label))
	}

	return out
}

0 comments on commit ed5357c

Please sign in to comment.