Skip to content

Commit

Permalink
DCGM receiver: Soft-disable the receiver when DCGM is not installed
Browse files Browse the repository at this point in the history
  • Loading branch information
LujieDuan committed Jul 20, 2023
1 parent a0d48c7 commit a0c2a40
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
13 changes: 11 additions & 2 deletions receiver/dcgmreceiver/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const maxWarningsForFailedDeviceMetricQuery = 5

type dcgmClient struct {
logger *zap.SugaredLogger
disable bool
handleCleanup func()
enabledFieldIDs []dcgm.Short
enabledFieldGroup dcgm.FieldHandle
Expand All @@ -56,8 +57,9 @@ var dcgmGetLatestValuesForFields = dcgm.GetLatestValuesForFields

func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) {
dcgmCleanup, err := initializeDcgm(config, logger)
// When DCGM is not installed or not running, return a disabled client instead of an error
if err != nil {
return nil, err
return &dcgmClient{logger: logger.Sugar(), disable: true}, nil
}

deviceIndices, names, UUIDs, err := discoverDevices(logger)
Expand Down Expand Up @@ -282,7 +284,10 @@ func (client *dcgmClient) cleanup() {
if client.handleCleanup != nil {
client.handleCleanup()
}
client.logger.Info("Shutdown DCGM")

if !client.disable {
client.logger.Info("Shutdown DCGM")
}
}

func (client *dcgmClient) getDeviceModelName(gpuIndex uint) string {
Expand All @@ -294,6 +299,10 @@ func (client *dcgmClient) getDeviceUUID(gpuIndex uint) string {
}

func (client *dcgmClient) collectDeviceMetrics() ([]dcgmMetric, error) {
if client.disable {
return nil, nil
}

var err scrapererror.ScrapeErrors
gpuMetrics := make([]dcgmMetric, 0, len(client.enabledFieldIDs)*len(client.deviceIndices))
for _, gpuIndex := range client.deviceIndices {
Expand Down
5 changes: 3 additions & 2 deletions receiver/dcgmreceiver/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ func TestNewDcgmClientOnInitializationError(t *testing.T) {

client, err := newClient(createDefaultConfig().(*Config), logger)
assert.Equal(t, seenDcgmConnectionWarning, true)
assert.Regexp(t, ".*Unable to connect.*", err)
require.Nil(t, client)
require.NotNil(t, client)
require.Equal(t, client.disable, true)
require.Nil(t, err)
}

0 comments on commit a0c2a40

Please sign in to comment.