Skip to content

Commit

Permalink
SKS-2999: Set WaitingForAvailableHostWithSufficientMemoryReason when no sufficient memory hosts (#188)
Browse files Browse the repository at this point in the history
  • Loading branch information
haijianyang authored Dec 17, 2024
1 parent 0b94a34 commit 645e9dd
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 9 deletions.
9 changes: 9 additions & 0 deletions api/v1beta1/conditions_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@ const (
// waiting for an available host required by placement group to create VM.
WaitingForAvailableHostRequiredByPlacementGroupReason = "WaitingForAvailableHostRequiredByPlacementGroup"

// SelectingGPUFailedReason (Severity=Warning) documents an ElfMachine controller detecting
// an error while selecting GPU; those kind of errors are usually transient and failed updating
// are automatically re-tried by the controller.
SelectingGPUFailedReason = "SelectingGPUFailed"

// WaitingForAvailableHostWithSufficientMemoryReason (Severity=Info) documents an ElfMachine
// waiting for an available host with sufficient memory to create VM.
WaitingForAvailableHostWithSufficientMemoryReason = "WaitingForAvailableHostWithSufficientMemory"

// WaitingForAvailableHostWithEnoughGPUsReason (Severity=Info) documents an ElfMachine
// waiting for an available host with enough GPUs to create VM.
WaitingForAvailableHostWithEnoughGPUsReason = "WaitingForAvailableHostWithEnoughGPUs"
Expand Down
18 changes: 12 additions & 6 deletions controllers/elfmachine_controller_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,18 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx goctx.Context, machine
return ptr.To(""), nil, nil
}

var availableHosts service.Hosts
defer func() {
if rethost == nil {
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForAvailableHostWithEnoughGPUsReason, clusterv1.ConditionSeverityInfo, "")

log.V(1).Info("No host with the required GPU devices for the virtual machine, so wait for enough available hosts")
if reterr != nil {
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.SelectingGPUFailedReason, clusterv1.ConditionSeverityError, reterr.Error())
} else if rethost == nil {
if availableHosts.Len() == 0 {
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForAvailableHostWithSufficientMemoryReason, clusterv1.ConditionSeverityWarning, "")
log.V(1).Info("Waiting for enough available hosts")
} else {
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForAvailableHostWithEnoughGPUsReason, clusterv1.ConditionSeverityInfo, "")
log.V(1).Info("No host with the required GPU devices for the virtual machine, so wait for enough available hosts")
}
}
}()

Expand All @@ -83,9 +90,8 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx goctx.Context, machine
return nil, nil, err
}

availableHosts := hosts.FilterAvailableHostsWithEnoughMemory(*service.TowerMemory(machineCtx.ElfMachine.Spec.MemoryMiB))
availableHosts = hosts.FilterAvailableHostsWithEnoughMemory(*service.TowerMemory(machineCtx.ElfMachine.Spec.MemoryMiB))
if len(availableHosts) == 0 {
log.V(2).Info("Waiting for enough available hosts")
return nil, nil, nil
}

Expand Down
13 changes: 10 additions & 3 deletions controllers/elfmachine_controller_gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ var _ = Describe("ElfMachineReconciler-GPU", func() {
logBuffer.Reset()
removeGPUVMInfosCache(gpuIDs)
gpuVMInfo.Vms = []*models.GpuVMDetail{{ID: service.TowerString("id"), Name: service.TowerString("vm"), Status: models.NewVMStatus(models.VMStatusRUNNING)}}
mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(nil, nil)
mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil)
mockVMService.EXPECT().GetGPUDevicesAllocationInfoByIDs([]string{*gpuVMInfo.ID}).Return(gpuVMInfos, nil)
mockVMService.EXPECT().GetGPUDevicesAllocationInfoByHostIDs([]string{*host.ID}, models.GpuDeviceUsagePASSTHROUGH).Return(service.NewGPUVMInfos(), nil)
hostID, gpus, err = reconciler.selectHostAndGPUsForVM(ctx, machineContext, "")
Expect(err).NotTo(HaveOccurred())
Expect(hostID).To(BeNil())
Expand Down Expand Up @@ -268,7 +269,7 @@ var _ = Describe("ElfMachineReconciler-GPU", func() {
Expect(elfMachine.Status.GPUDevices).To(Equal([]infrav1.GPUStatus{{GPUID: *vm.GpuDevices[0].ID, Name: *vm.GpuDevices[0].Name}}))
})

It("should add GPU devices to VM when the VM without GPU devices", func() {
It("should add GPU devices to VM when without available hosts", func() {
host := fake.NewTowerHost()
vm := fake.NewTowerVMFromElfMachine(elfMachine)
vm.Host = &models.NestedHost{ID: host.ID}
Expand All @@ -282,7 +283,13 @@ var _ = Describe("ElfMachineReconciler-GPU", func() {
ok, err := reconciler.reconcileGPUDevices(ctx, machineContext, vm)
Expect(err).To(HaveOccurred())
Expect(ok).To(BeFalse())
expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForAvailableHostWithEnoughGPUsReason}})
expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityError, infrav1.SelectingGPUFailedReason}})

mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.Hosts{}, nil)
ok, err = reconciler.reconcileGPUDevices(ctx, machineContext, vm)
Expect(err).NotTo(HaveOccurred())
Expect(ok).To(BeFalse())
expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.WaitingForAvailableHostWithSufficientMemoryReason}})
})

It("should remove GPU devices to VM when detect host are not sufficient", func() {
Expand Down

0 comments on commit 645e9dd

Please sign in to comment.