From a9364d520a9cabbe30ad9676eed72ffdf2459313 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Fri, 12 Apr 2024 21:26:47 +0000 Subject: [PATCH] Parallelize tests to allow us to reenable testing in pkg/... Signed-off-by: Kevin Klues --- Makefile | 2 +- pkg/mig/config/config_test.go | 29 ++++++++++++++++++---------- pkg/mig/config/known_configs.go | 23 ++++++++++++++++------ pkg/mig/config/known_configs_test.go | 4 +++- pkg/mig/state/state_test.go | 17 +++++++++------- 5 files changed, 50 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index dd8ef674..af5b121e 100644 --- a/Makefile +++ b/Makefile @@ -83,7 +83,7 @@ check-vendor: vendor COVERAGE_FILE := coverage.out test: build cmds - go test -coverprofile=$(COVERAGE_FILE) $(MODULE)/cmd/... $(MODULE)/internal/... $(MODULE)/api/... + go test -v -coverprofile=$(COVERAGE_FILE) $(MODULE)/cmd/... $(MODULE)/internal/... $(MODULE)/api/... $(MODULE)/pkg/... coverage: test cat $(COVERAGE_FILE) | grep -v "_mock.go" > $(COVERAGE_FILE).no-mocks diff --git a/pkg/mig/config/config_test.go b/pkg/mig/config/config_test.go index ea326378..d937ad96 100644 --- a/pkg/mig/config/config_test.go +++ b/pkg/mig/config/config_test.go @@ -44,13 +44,6 @@ func EnableMigMode(manager Manager, gpu int) (nvml.Return, nvml.Return) { } func TestGetSetMigConfig(t *testing.T) { - nvmlLib := dgxa100.New() - manager := NewMockLunaServerMigConfigManager() - - numGPUs, ret := nvmlLib.DeviceGetCount() - require.NotNil(t, ret, "Unexpected nil return from DeviceGetCount") - require.Equal(t, ret, nvml.SUCCESS, "Unexpected return value from DeviceGetCount") - mcg := NewA100_SXM4_40GB_MigConfigGroup() type testCase struct { @@ -69,8 +62,18 @@ func TestGetSetMigConfig(t *testing.T) { return testCases }() - for _, tc := range testCases { + for i := range testCases { + tc := testCases[i] // to allow us to run parallelly t.Run(tc.description, func(t *testing.T) { + t.Parallel() + + nvmlLib := dgxa100.New() + manager := NewMockLunaServerMigConfigManager() + + numGPUs, ret := nvmlLib.DeviceGetCount() + require.NotNil(t, ret, "Unexpected nil return from DeviceGetCount") + require.Equal(t, ret, nvml.SUCCESS, "Unexpected return value from DeviceGetCount") + for i := 0; i < numGPUs; i++ { r1, r2 := EnableMigMode(manager, i) require.Equal(t, nvml.SUCCESS, r1) @@ -106,8 +109,11 @@ func TestClearMigConfig(t *testing.T) { return testCases }() - for _, tc := range testCases { + for i := range testCases { + tc := testCases[i] // to allow us to run parallelly t.Run(tc.description, func(t *testing.T) { + t.Parallel() + manager := NewMockLunaServerMigConfigManager() r1, r2 := EnableMigMode(manager, 0) @@ -173,8 +179,11 @@ func TestIteratePermutationsUntilSuccess(t *testing.T) { return testCases }() - for _, tc := range testCases { + for i := range testCases { + tc := testCases[i] // to allow us to run parallelly t.Run(tc.description, func(t *testing.T) { + t.Parallel() + iteration := 0 err := iteratePermutationsUntilSuccess(tc.config, func(perm []*types.MigProfile) error { iteration++ diff --git a/pkg/mig/config/known_configs.go b/pkg/mig/config/known_configs.go index 1f4125f7..28d1be57 100644 --- a/pkg/mig/config/known_configs.go +++ b/pkg/mig/config/known_configs.go @@ -19,6 +19,7 @@ package config import ( "fmt" "sort" + "sync" "github.com/NVIDIA/mig-parted/pkg/types" ) @@ -73,8 +74,9 @@ func NewA100_SXM4_40GB_MigConfigGroup() types.MigConfigGroup { } func (m *a100_sxm4_40gb_MigConfigGroup) init() { + var mutex sync.RWMutex configs := make(map[string]types.MigConfig) - m.iterateDeviceTypes(func(mps []*types.MigProfile) LoopControl { + m.parallelIterateDeviceTypes(func(mps []*types.MigProfile) LoopControl { cis := 0 cis_per_gi := make(map[int]int) mes_per_gi := make(map[int]int) @@ -107,10 +109,12 @@ func (m *a100_sxm4_40gb_MigConfigGroup) init() { return mps[i].String() < mps[j].String() }) + mutex.Lock() str := fmt.Sprintf("%v", mps) if _, exists := configs[str]; !exists { configs[str] = types.NewMigConfig(mps) } + mutex.Unlock() if cis < 7 { return Continue @@ -118,9 +122,11 @@ func (m *a100_sxm4_40gb_MigConfigGroup) init() { return Break }) + mutex.RLock() for _, v := range configs { m.Configs = append(m.Configs, v) } + mutex.RUnlock() } func (m *a100_sxm4_40gb_MigConfigGroup) GetDeviceTypes() []*types.MigProfile { @@ -143,7 +149,7 @@ func (m *a100_sxm4_40gb_MigConfigGroup) GetDeviceTypes() []*types.MigProfile { } } -func (m *a100_sxm4_40gb_MigConfigGroup) iterateDeviceTypes(f func([]*types.MigProfile) LoopControl) { +func (m *a100_sxm4_40gb_MigConfigGroup) parallelIterateDeviceTypes(f func([]*types.MigProfile) LoopControl) { maxDevices := types.MigConfig{ mig_1c_1g_5gb: 7, mig_1c_1g_5gb_me: 1, @@ -162,20 +168,25 @@ func (m *a100_sxm4_40gb_MigConfigGroup) iterateDeviceTypes(f func([]*types.MigPr mig_7c_7g_40gb: 1, }.Flatten() - var iterate func(i int, accum []*types.MigProfile) LoopControl - iterate = func(i int, accum []*types.MigProfile) LoopControl { + var iterate func(i int, accum []*types.MigProfile, wg *sync.WaitGroup) LoopControl + iterate = func(i int, accum []*types.MigProfile, wg *sync.WaitGroup) LoopControl { + defer wg.Done() accum = append(accum, maxDevices[i]) control := f(accum) if control == Break { return Continue } for j := i + 1; j < len(maxDevices); j++ { - iterate(j, accum) + wg.Add(1) + go iterate(j, accum, wg) } return Continue } + var wg sync.WaitGroup for i := 0; i < len(maxDevices); i++ { - iterate(i, []*types.MigProfile{}) + wg.Add(1) + go iterate(i, []*types.MigProfile{}, &wg) } + wg.Wait() } diff --git a/pkg/mig/config/known_configs_test.go b/pkg/mig/config/known_configs_test.go index 33695ff5..bb337017 100644 --- a/pkg/mig/config/known_configs_test.go +++ b/pkg/mig/config/known_configs_test.go @@ -132,8 +132,10 @@ func TestValidConfiguration(t *testing.T) { types.SetMockNVdevlib() configs := GetKnownMigConfigGroups() - for _, tc := range testCases { + for i := range testCases { + tc := testCases[i] // to allow us to run parallelly t.Run(tc.description, func(t *testing.T) { + t.Parallel() err := configs[tc.gpu].AssertValidConfiguration(tc.config) if tc.valid { require.Nil(t, err) diff --git a/pkg/mig/state/state_test.go b/pkg/mig/state/state_test.go index e9c161b3..1306b6e1 100644 --- a/pkg/mig/state/state_test.go +++ b/pkg/mig/state/state_test.go @@ -36,12 +36,6 @@ func newMockMigStateManagerOnLunaServer() *migStateManager { } func TestFetchRestore(t *testing.T) { - manager := newMockMigStateManagerOnLunaServer() - - numGPUs, ret := manager.nvml.DeviceGetCount() - require.NotNil(t, ret, "Unexpected nil return from DeviceGetCount") - require.Equal(t, ret, nvml.SUCCESS, "Unexpected return value from DeviceGetCount") - mcg := config.NewA100_SXM4_40GB_MigConfigGroup() type testCase struct { @@ -73,8 +67,17 @@ func TestFetchRestore(t *testing.T) { return testCases }() - for _, tc := range testCases { + for i := range testCases { + tc := testCases[i] // to allow us to run parallelly t.Run(tc.description, func(t *testing.T) { + t.Parallel() + + manager := newMockMigStateManagerOnLunaServer() + + numGPUs, ret := manager.nvml.DeviceGetCount() + require.NotNil(t, ret, "Unexpected nil return from DeviceGetCount") + require.Equal(t, ret, nvml.SUCCESS, "Unexpected return value from DeviceGetCount") + for i := 0; i < numGPUs; i++ { err := manager.mode.SetMigMode(i, tc.mode) require.Nil(t, err)