diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go index fd65260..10f5efc 100644 --- a/api/v1beta1/conditions_consts.go +++ b/api/v1beta1/conditions_consts.go @@ -118,6 +118,15 @@ const ( // waiting for an available host required by placement group to create VM. WaitingForAvailableHostRequiredByPlacementGroupReason = "WaitingForAvailableHostRequiredByPlacementGroup" + // SelectingGPUFailedReason (Severity=Error) documents an ElfMachine controller detecting + // an error while selecting GPU; those kind of errors are usually transient and failed updating + // are automatically re-tried by the controller. + SelectingGPUFailedReason = "SelectingGPUFailed" + + // WaitingForAvailableHostWithSufficientMemoryReason (Severity=Warning) documents an ElfMachine + // waiting for an available host with sufficient memory to create VM. + WaitingForAvailableHostWithSufficientMemoryReason = "WaitingForAvailableHostWithSufficientMemory" + // WaitingForAvailableHostWithEnoughGPUsReason (Severity=Info) documents an ElfMachine // waiting for an available host with enough GPUs to create VM. 
WaitingForAvailableHostWithEnoughGPUsReason = "WaitingForAvailableHostWithEnoughGPUs" diff --git a/controllers/elfmachine_controller_gpu.go b/controllers/elfmachine_controller_gpu.go index 91bc801..963c21e 100644 --- a/controllers/elfmachine_controller_gpu.go +++ b/controllers/elfmachine_controller_gpu.go @@ -53,11 +53,18 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx goctx.Context, machine return ptr.To(""), nil, nil } + var availableHosts service.Hosts defer func() { - if rethost == nil { - conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForAvailableHostWithEnoughGPUsReason, clusterv1.ConditionSeverityInfo, "") - - log.V(1).Info("No host with the required GPU devices for the virtual machine, so wait for enough available hosts") + if reterr != nil { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.SelectingGPUFailedReason, clusterv1.ConditionSeverityError, reterr.Error()) + } else if rethost == nil { + if availableHosts.Len() == 0 { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForAvailableHostWithSufficientMemoryReason, clusterv1.ConditionSeverityWarning, "") + log.V(1).Info("Waiting for enough available hosts") + } else { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForAvailableHostWithEnoughGPUsReason, clusterv1.ConditionSeverityInfo, "") + log.V(1).Info("No host with the required GPU devices for the virtual machine, so wait for enough available hosts") + } } }() @@ -83,9 +90,8 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx goctx.Context, machine return nil, nil, err } - availableHosts := hosts.FilterAvailableHostsWithEnoughMemory(*service.TowerMemory(machineCtx.ElfMachine.Spec.MemoryMiB)) + availableHosts = hosts.FilterAvailableHostsWithEnoughMemory(*service.TowerMemory(machineCtx.ElfMachine.Spec.MemoryMiB)) if len(availableHosts) == 0 { - 
log.V(2).Info("Waiting for enough available hosts") return nil, nil, nil } diff --git a/controllers/elfmachine_controller_gpu_test.go b/controllers/elfmachine_controller_gpu_test.go index 240764d..52fed91 100644 --- a/controllers/elfmachine_controller_gpu_test.go +++ b/controllers/elfmachine_controller_gpu_test.go @@ -134,8 +134,9 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { logBuffer.Reset() removeGPUVMInfosCache(gpuIDs) gpuVMInfo.Vms = []*models.GpuVMDetail{{ID: service.TowerString("id"), Name: service.TowerString("vm"), Status: models.NewVMStatus(models.VMStatusRUNNING)}} - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(nil, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().GetGPUDevicesAllocationInfoByIDs([]string{*gpuVMInfo.ID}).Return(gpuVMInfos, nil) + mockVMService.EXPECT().GetGPUDevicesAllocationInfoByHostIDs([]string{*host.ID}, models.GpuDeviceUsagePASSTHROUGH).Return(service.NewGPUVMInfos(), nil) hostID, gpus, err = reconciler.selectHostAndGPUsForVM(ctx, machineContext, "") Expect(err).NotTo(HaveOccurred()) Expect(hostID).To(BeNil()) @@ -268,7 +269,7 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { Expect(elfMachine.Status.GPUDevices).To(Equal([]infrav1.GPUStatus{{GPUID: *vm.GpuDevices[0].ID, Name: *vm.GpuDevices[0].Name}})) }) - It("should add GPU devices to VM when the VM without GPU devices", func() { + It("should add GPU devices to VM when without available hosts", func() { host := fake.NewTowerHost() vm := fake.NewTowerVMFromElfMachine(elfMachine) vm.Host = &models.NestedHost{ID: host.ID} @@ -282,7 +283,13 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { ok, err := reconciler.reconcileGPUDevices(ctx, machineContext, vm) Expect(err).To(HaveOccurred()) Expect(ok).To(BeFalse()) - expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, 
infrav1.WaitingForAvailableHostWithEnoughGPUsReason}}) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityError, infrav1.SelectingGPUFailedReason}}) + + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.Hosts{}, nil) + ok, err = reconciler.reconcileGPUDevices(ctx, machineContext, vm) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeFalse()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.WaitingForAvailableHostWithSufficientMemoryReason}}) }) It("should remove GPU devices to VM when detect host are not sufficient", func() {