gpu: switch to dcgm standalone mode due to containerization limitations
Signed-off-by: Huamin Chen <[email protected]>
rootfs committed Feb 21, 2024
1 parent f89960d commit d42bbdd
Showing 6 changed files with 39 additions and 8 deletions.
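
At its core, the commit swaps DCGM's embedded mode, where the host engine runs inside the exporter process, for standalone mode, where the exporter connects to an external nv-hostengine over the network. The two call forms, both visible in gpu_dcgm.go below, differ only in the Init arguments (the trailing "0" appears to tell the go-dcgm bindings that the endpoint is a host:port address rather than a unix socket):

// before: host engine embedded in the exporter process
cleanup, err := dcgm.Init(dcgm.Embedded)

// after: connect to an external nv-hostengine (default endpoint localhost:5555)
cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, "0")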
3 changes: 2 additions & 1 deletion build/Dockerfile
@@ -15,8 +15,9 @@ RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.n
RUN if [ $(uname -i) == "x86_64" ]; then yum install -y cpuid; fi

ENV NVIDIA_VISIBLE_DEVICES=all
# add utility to support nvidia-smi
ENV NVIDIA_DRIVER_CAPABILITIES=utility
ENV NVIDIA_MIG_CONFIG_DEVICES=all
ENV NVIDIA_MIG_MONITOR_DEVICES=all

RUN INSTALL_PKGS=" \
libbpf \
3 changes: 2 additions & 1 deletion cmd/exporter/exporter.go
@@ -203,10 +203,11 @@ func main() {
// the GPU operator typically takes longer to initialize than Kepler, which causes the GPU driver start-up to fail
// therefore, we wait up to 1 min to allow the GPU operator to initialize
for i := 0; i <= maxGPUInitRetry; i++ {
time.Sleep(6 * time.Second)
err = gpu.Init()
if err == nil {
break
} else {
time.Sleep(6 * time.Second)
}
}
if err == nil {
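
For reference, the one-minute budget mentioned in the comment implies roughly ten iterations of the 6-second sleep; maxGPUInitRetry itself is declared outside this hunk, so the value below is only an assumption consistent with that comment:

// assumed values, not shown in this diff: ~10 retries x 6 s ≈ 1 min total wait
const maxGPUInitRetry = 10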
3 changes: 3 additions & 0 deletions pkg/config/config.go
@@ -101,6 +101,9 @@ var (

configPath = "/etc/kepler/kepler.config"

// nvidia dcgm hostengine endpoint
DCGMHostEngineEndpoint = getConfig("NVIDIA_HOSTENGINE_ENDPOINT", "localhost:5555")

// dir of kernel sources for bcc
kernelSourceDirs = []string{}

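
The new DCGMHostEngineEndpoint setting is resolved through the existing getConfig helper, whose definition is outside this diff. A minimal sketch of what such an env-backed lookup typically looks like (hypothetical; the real helper in pkg/config may also consult files under /etc/kepler/kepler.config):

import "os"

// hypothetical sketch; the actual getConfig in pkg/config may differ
func getConfig(key, defaultValue string) string {
	if v, ok := os.LookupEnv(key); ok && v != "" {
		return v
	}
	return defaultValue
}

With that in place, setting NVIDIA_HOSTENGINE_ENDPOINT in the exporter's environment overrides the localhost:5555 default.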
3 changes: 3 additions & 0 deletions pkg/sensors/accelerator/gpu/gpu.go
@@ -44,11 +44,14 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp
func init() {
var errLib error
for i := 0; i < len(acceleratorOrder); i++ {
klog.Infof("Trying to initialize GPU collector using %s", acceleratorOrder[i].GetName())
acceleratorImpl = acceleratorOrder[i]
errLib = acceleratorImpl.InitLib()
if errLib == nil {
klog.Infof("Using %s to obtain gpu power", acceleratorImpl.GetName())
return
} else {
klog.Infof("Error initializing %s: %v", acceleratorImpl.GetName(), errLib)
}
}
klog.Infof("no gpu collector available: %v", errLib)
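
The loop above drives each backend through the same small contract declared in power.go. That declaration is not part of this diff; inferred from the calls used here and in the two sources below, it looks roughly like this (name and method set are an inference, not the exact interface):

type acceleratorInterface interface {
	GetName() string // backend name used in the log lines above
	InitLib() error  // load or connect the vendor library (DCGM, NVML)
	Init() error     // create device/field groups, start watchers
	Shutdown() bool  // release library resources, reset state
}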
26 changes: 20 additions & 6 deletions pkg/sensors/accelerator/gpu/source/gpu_dcgm.go
@@ -51,6 +51,7 @@ var (

type GPUDcgm struct {
collectionSupported bool
libInited bool
devices map[string]interface{}
deviceGroupName string
deviceGroupHandle dcgm.GroupHandle
@@ -70,12 +71,13 @@ func (d *GPUDcgm) InitLib() error {
d.devices = make(map[string]interface{})
d.entities = make(map[string]dcgm.GroupEntityPair)

cleanup, err := dcgm.Init(dcgm.Embedded)
// cleanup, err := dcgm.Init(dcgm.Embedded) // embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, "0")
if err != nil {
if cleanup != nil {
cleanup()
}
return fmt.Errorf("not able to connect to DCGM: %s", err)
return fmt.Errorf("not able to connect to DCGM %v: %s", config.DCGMHostEngineEndpoint, err)
}
d.cleanup = cleanup
dcgm.FieldsInit()
@@ -84,26 +86,37 @@
d.Shutdown()
return err
}
d.libInited = true
return nil
}

func (d *GPUDcgm) Init() error {
if !d.libInited {
if err := d.InitLib(); err != nil {
klog.Infof("failed to init lib: %v", err)
return err
}
}
if err := d.createDeviceGroup(); err != nil {
klog.Infof("failed to create device group: %v", err)
d.Shutdown()
return err
}

if err := d.addDevicesToGroup(); err != nil {
klog.Infof("failed to add devices to group: %v", err)
d.Shutdown()
return err
}

if err := d.createFieldGroup(); err != nil {
klog.Infof("failed to create field group: %v", err)
d.Shutdown()
return err
}

if err := d.setupWatcher(); err != nil {
klog.Infof("failed to set up watcher: %v", err)
d.Shutdown()
return err
}
@@ -133,6 +146,7 @@ func (d *GPUDcgm) Shutdown() bool {
d.cleanup()
}
d.collectionSupported = false
d.libInited = false
return true
}

@@ -197,8 +211,8 @@ func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, dev
return processAcceleratorMetrics, fmt.Errorf("failed to get running processes: %v", nvml.ErrorString(ret))
}
for _, p := range processInfo {
// klog.V(debugLevel).Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId)
if p.GpuInstanceId > 0 { // this is a MIG instance; get its entity id and read the related fields
klog.V(debugLevel).Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId)
if p.GpuInstanceId > 0 && p.GpuInstanceId < uint32(len(gpuMigArray[deviceIndex])) { // this is a MIG instance; get its entity id and read the related fields
entityName := gpuMigArray[deviceIndex][p.GpuInstanceId].EntityName
multiprocessorCountRatio := gpuMigArray[deviceIndex][p.GpuInstanceId].MultiprocessorCountRatio
mi := d.entities[entityName]
@@ -245,7 +259,7 @@ func (d *GPUDcgm) initNVML() error {
}

func (d *GPUDcgm) createDeviceGroup() error {
deviceGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05")
deviceGroupName := "dev-grp-" + time.Now().Format("2006-01-02-15-04-05")
deviceGroup, err := dcgm.CreateGroup(deviceGroupName)
if err != nil {
return fmt.Errorf("failed to create group %q: %v", deviceGroupName, err)
@@ -310,7 +324,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
}

func (d *GPUDcgm) createFieldGroup() error {
fieldGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05")
fieldGroupName := "fld-grp-" + time.Now().Format("2006-01-02-15-04-05")
fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields)
if err != nil {
return fmt.Errorf("failed to create field group %q: %v", fieldGroupName, err)
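
Because standalone mode depends on an nv-hostengine being reachable at config.DCGMHostEngineEndpoint, a quick connectivity check can save debugging time. Below is a minimal, hypothetical probe that uses the same go-dcgm call as InitLib above; the endpoint comes from the command line and defaults to localhost:5555:

package main

import (
	"fmt"
	"os"

	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
)

func main() {
	endpoint := "localhost:5555"
	if len(os.Args) > 1 {
		endpoint = os.Args[1]
	}
	// "0": the endpoint is a host:port address, not a unix socket
	cleanup, err := dcgm.Init(dcgm.Standalone, endpoint, "0")
	if err != nil {
		fmt.Fprintf(os.Stderr, "cannot reach DCGM host engine at %s: %v\n", endpoint, err)
		os.Exit(1)
	}
	defer cleanup()
	fmt.Printf("connected to DCGM host engine at %s\n", endpoint)
}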
9 changes: 9 additions & 0 deletions pkg/sensors/accelerator/gpu/source/gpu_nvml.go
@@ -36,6 +36,7 @@ var (
)

type GPUNvml struct {
libInited bool
collectionSupported bool
}

@@ -57,10 +58,17 @@ func (n *GPUNvml) InitLib() (err error) {
err = fmt.Errorf("failed to init nvml. %s", nvmlErrorString(ret))
return err
}
n.libInited = true
return nil
}

func (n *GPUNvml) Init() (err error) {
if !n.libInited {
if err := n.InitLib(); err != nil {
return err
}
}

count, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
nvml.Shutdown()
@@ -89,6 +97,7 @@ func (n *GPUNvml) Init() (err error) {

// Shutdown stops the GPU metric collector
func (n *GPUNvml) Shutdown() bool {
n.libInited = false
return nvml.Shutdown() == nvml.SUCCESS
}

