From a0c2a409b88f28bc09a624ef84fa635204746f9f Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Tue, 18 Jul 2023 15:14:06 +0000 Subject: [PATCH] DCGM receiver: Soft disable the receiver when DCGM not installed --- receiver/dcgmreceiver/client.go | 13 +++++++++++-- receiver/dcgmreceiver/client_test.go | 5 +++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index ff3a14113..f3e5407eb 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -31,6 +31,7 @@ const maxWarningsForFailedDeviceMetricQuery = 5 type dcgmClient struct { logger *zap.SugaredLogger + disable bool handleCleanup func() enabledFieldIDs []dcgm.Short enabledFieldGroup dcgm.FieldHandle @@ -56,8 +57,9 @@ var dcgmGetLatestValuesForFields = dcgm.GetLatestValuesForFields func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { dcgmCleanup, err := initializeDcgm(config, logger) + // When DCGM is not installed or not running, return empty client if err != nil { - return nil, err + return &dcgmClient{logger: logger.Sugar(), disable: true}, nil } deviceIndices, names, UUIDs, err := discoverDevices(logger) @@ -282,7 +284,10 @@ func (client *dcgmClient) cleanup() { if client.handleCleanup != nil { client.handleCleanup() } - client.logger.Info("Shutdown DCGM") + + if !client.disable { + client.logger.Info("Shutdown DCGM") + } } func (client *dcgmClient) getDeviceModelName(gpuIndex uint) string { @@ -294,6 +299,10 @@ func (client *dcgmClient) getDeviceUUID(gpuIndex uint) string { } func (client *dcgmClient) collectDeviceMetrics() ([]dcgmMetric, error) { + if client.disable { + return nil, nil + } + var err scrapererror.ScrapeErrors gpuMetrics := make([]dcgmMetric, 0, len(client.enabledFieldIDs)*len(client.deviceIndices)) for _, gpuIndex := range client.deviceIndices { diff --git a/receiver/dcgmreceiver/client_test.go b/receiver/dcgmreceiver/client_test.go index 27877e656..66d29357a 100644 --- a/receiver/dcgmreceiver/client_test.go +++ b/receiver/dcgmreceiver/client_test.go @@ -46,6 +46,7 @@ func TestNewDcgmClientOnInitializationError(t *testing.T) { client, err := newClient(createDefaultConfig().(*Config), logger) assert.Equal(t, seenDcgmConnectionWarning, true) - assert.Regexp(t, ".*Unable to connect.*", err) - require.Nil(t, client) + require.NotNil(t, client) + require.Equal(t, client.disable, true) + require.Nil(t, err) }