Skip to content

Commit

Permalink
improve conditions
Browse files Browse the repository at this point in the history
  • Loading branch information
huaqing1994 committed Oct 31, 2024
1 parent 6294751 commit a9903bc
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 160 deletions.
3 changes: 0 additions & 3 deletions api/v1beta1/conditions_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ const (
// ResourcesHotUpdatedCondition documents the status of the hot updating resources of a VM.
ResourcesHotUpdatedCondition = "ResourceHotUpdated"

// WaitingForResourcesHotUpdateReason (Severity=Info) documents an ElfMachine waiting for updating resources.
WaitingForResourcesHotUpdateReason = "WaitingForResourcesHotUpdate"

// ExpandingVMDiskReason documents (Severity=Info) ElfMachine currently executing the expand disk operation.
ExpandingVMDiskReason = "ExpandingVMDisk"

Expand Down
11 changes: 8 additions & 3 deletions controllers/elfmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,8 @@ func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (r
// always update the readyCondition.
conditions.SetSummary(machineCtx.ElfMachine,
conditions.WithConditions(
infrav1.VMProvisionedCondition,
infrav1.ResourcesHotUpdatedCondition,
infrav1.VMProvisionedCondition,
infrav1.TowerAvailableCondition,
),
)
Expand Down Expand Up @@ -973,7 +973,7 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx goctx.Context, machineCtx *co
machineCtx.ElfMachine.SetVMFirstBootTimestamp(&now)
}

if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) {
if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) || service.IsUpdateVMTask(task) {
releaseTicketForCreateVM(machineCtx.ElfMachine.Name)
recordElfClusterStorageInsufficient(machineCtx, false)
recordElfClusterMemoryInsufficient(machineCtx, false)
Expand Down Expand Up @@ -1024,7 +1024,12 @@ func (r *ElfMachineReconciler) reconcileVMFailedTask(ctx goctx.Context, machineC
case service.IsUpdateVMDiskTask(task, machineCtx.ElfMachine.Name):
reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition)
if reason == infrav1.ExpandingVMDiskReason || reason == infrav1.ExpandingVMDiskFailedReason {
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskFailedReason, clusterv1.ConditionSeverityInfo, errorMessage)
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskFailedReason, clusterv1.ConditionSeverityWarning, errorMessage)
}
case service.IsUpdateVMTask(task) && conditions.IsFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition):
reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition)
if reason == infrav1.ExpandingVMResourcesReason || reason == infrav1.ExpandingVMResourcesFailedReason {
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesFailedReason, clusterv1.ConditionSeverityWarning, errorMessage)
}
case service.IsPowerOnVMTask(task) || service.IsUpdateVMTask(task) || service.IsVMColdMigrationTask(task):
if machineCtx.ElfMachine.RequiresGPUDevices() {
Expand Down
4 changes: 0 additions & 4 deletions controllers/elfmachine_controller_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,10 +246,6 @@ func (r *ElfMachineReconciler) reconcileVMCPUAndMemory(ctx goctx.Context, machin
return false, errors.Wrapf(err, "failed to trigger update CPU and memory for VM %s", *vm.Name)
}

if reason == infrav1.ExpandingVMResourcesFailedReason {
conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "")
}

machineCtx.ElfMachine.SetTask(*withTaskVM.TaskID)

log.Info("Waiting for the VM to be updated CPU and memory", "vmRef", machineCtx.ElfMachine.Status.VMRef, "taskRef", machineCtx.ElfMachine.Status.TaskRef)
Expand Down
69 changes: 41 additions & 28 deletions controllers/elfmachine_controller_resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package controllers
import (
"bytes"
goctx "context"
"fmt"
"time"

"github.com/go-logr/logr"
Expand All @@ -27,6 +26,7 @@ import (
agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
Expand Down Expand Up @@ -78,19 +78,6 @@ var _ = Describe("ElfMachineReconciler", func() {
})

Context("reconcileVMResources", func() {
It("should reconcile when WaitingForResourcesHotUpdateReason is not empty", func() {
conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "xx")
ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret)
fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService)
vm := fake.NewTowerVMFromElfMachine(elfMachine)
reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService}
ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm)
Expect(ok).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(logBuffer.String()).To(ContainSubstring("Waiting for hot updating resources"))
})

It("should mark ResourcesHotUpdatedCondition to true", func() {
agentJob := newExpandRootPartitionJob(elfMachine)
Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred())
Expand All @@ -113,9 +100,9 @@ var _ = Describe("ElfMachineReconciler", func() {
mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil)
reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService}
ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm)
Expect(ok).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
expectConditions(elfMachine, []conditionAssertion{{conditionType: infrav1.ResourcesHotUpdatedCondition, status: corev1.ConditionTrue}})
Expect(err).NotTo(HaveOccurred())
Expect(ok).To(BeTrue())
})
})

Expand Down Expand Up @@ -205,17 +192,29 @@ var _ = Describe("ElfMachineReconciler", func() {
fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService)
vmVolume := fake.NewVMVolume(elfMachine)
mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(nil, unexpectedError)
task := fake.NewTowerTask("")
withTaskVMVolume := fake.NewWithTaskVMVolume(vmVolume, task)
mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(withTaskVMVolume, nil)
reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService}
err := reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.ResourcesHotUpdatedCondition)
Expect(err).ToNot(HaveOccurred())
Expect(logBuffer.String()).To(ContainSubstring("Waiting for the vm volume"))
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskReason}})

mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(nil, unexpectedError)
ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret)
fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
machineContext = newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService)
reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService}
err = reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.ResourcesHotUpdatedCondition)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("failed to trigger expand size from"))
Expect(elfMachine.Status.TaskRef).To(Equal(*withTaskVMVolume.TaskID))
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}})

task := fake.NewTowerTask("")
withTaskVMVolume := fake.NewWithTaskVMVolume(vmVolume, task)
task = fake.NewTowerTask("")
withTaskVMVolume = fake.NewWithTaskVMVolume(vmVolume, task)
mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(withTaskVMVolume, nil)
conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "")
ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret)
fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
machineContext = newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService)
Expand All @@ -224,7 +223,7 @@ var _ = Describe("ElfMachineReconciler", func() {
Expect(err).NotTo(HaveOccurred())
Expect(logBuffer.String()).To(ContainSubstring("Waiting for the vm volume"))
Expect(elfMachine.Status.TaskRef).To(Equal(*withTaskVMVolume.TaskID))
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskReason}})
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}})
})
})

Expand Down Expand Up @@ -470,7 +469,8 @@ var _ = Describe("ElfMachineReconciler", func() {
Expect(ok).To(BeTrue())
Expect(err).NotTo(HaveOccurred())
Expect(elfMachine.Status.Resources.CPUCores).To(Equal(*vm.Vcpu))
Expect(elfMachine.Status.Resources.Memory.String()).To(Equal(fmt.Sprintf("%dMi", service.ByteToMiB(*vm.Memory))))
specMemory := *resource.NewQuantity(service.ByteToMiB(*vm.Memory)*1024*1024, resource.BinarySI)
Expect(elfMachine.Status.Resources.Memory.Equal(specMemory)).To(BeTrue())
})

It("should save the conditionType first", func() {
Expand All @@ -493,26 +493,39 @@ var _ = Describe("ElfMachineReconciler", func() {
ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret)
fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
machineCtx := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService)
mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(nil, unexpectedError)
task := fake.NewTowerTask("")
withTaskVM := fake.NewWithTaskVM(vm, task)
mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(withTaskVM, nil)
reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService}
ok, err := reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm)
Expect(ok).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(logBuffer.String()).To(ContainSubstring("Waiting for the VM to be updated CPU and memory"))
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMResourcesReason}})

logBuffer.Reset()
inMemoryCache.Flush()
mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(nil, unexpectedError)
reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService}
ok, err = reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm)
Expect(ok).To(BeFalse())
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("failed to trigger update CPU and memory for VM"))
Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID))
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMResourcesFailedReason}})

logBuffer.Reset()
inMemoryCache.Flush()
task := fake.NewTowerTask("")
withTaskVM := fake.NewWithTaskVM(vm, task)
task = fake.NewTowerTask("")
withTaskVM = fake.NewWithTaskVM(vm, task)
mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(withTaskVM, nil)
reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService}
ok, err = reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm)
Expect(ok).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(logBuffer.String()).To(ContainSubstring("Waiting for the VM to be updated CPU and memory"))
Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID))
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMResourcesReason}})
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMResourcesFailedReason}})
})
})
})
Expand All @@ -521,7 +534,7 @@ func newExpandRootPartitionJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOper
return &agentv1.HostOperationJob{
ObjectMeta: metav1.ObjectMeta{
Name: hostagent.GetExpandRootPartitionJobName(elfMachine),
Namespace: "sks-system",
Namespace: "default",
},
Spec: agentv1.HostOperationJobSpec{
NodeName: elfMachine.Name,
Expand All @@ -540,7 +553,7 @@ func newRestartKubelet(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob
return &agentv1.HostOperationJob{
ObjectMeta: metav1.ObjectMeta{
Name: hostagent.GetRestartKubeletJobName(elfMachine),
Namespace: "sks-system",
Namespace: "default",
},
Spec: agentv1.HostOperationJobSpec{
NodeName: elfMachine.Name,
Expand Down
23 changes: 21 additions & 2 deletions controllers/elfmachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3418,11 +3418,30 @@ var _ = Describe("ElfMachineReconciler", func() {
ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil)
Expect(ok).Should(BeTrue())
Expect(err).ShouldNot(HaveOccurred())
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskFailedReason}})
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}})

// Edit VM CPU/Memory
task.Status = models.NewTaskStatus(models.TaskStatusFAILED)
task.Description = service.TowerString("Edit VM")
task.ErrorMessage = service.TowerString(service.VMDuplicateError)
elfMachine.Status.TaskRef = *task.ID
ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil)
Expect(ok).Should(BeTrue())
Expect(err).ShouldNot(HaveOccurred())
expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.TaskFailureReason}})

elfMachine.Status.TaskRef = *task.ID
elfMachine.Status.Conditions = nil
conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "")
ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil)
Expect(ok).Should(BeTrue())
Expect(err).ShouldNot(HaveOccurred())
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMResourcesFailedReason}})

// GPU
gpuDeviceInfo := &service.GPUDeviceInfo{ID: "gpu", AllocatedCount: 0, AvailableCount: 1}
gpuDeviceInfos := []*service.GPUDeviceInfo{gpuDeviceInfo}
elfMachine.Status.Conditions = nil

tests := []struct {
description string
Expand All @@ -3447,7 +3466,7 @@ var _ = Describe("ElfMachineReconciler", func() {
ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil)
Expect(ok).Should(BeTrue())
Expect(err).ShouldNot(HaveOccurred())
Expect(getGPUDevicesLockedByVM(elfCluster.Spec.Cluster, elfMachine.Name)).To(BeNil())
Expect(getGPUDevicesLockedByVM(elfCluster.Spec.Cluster, elfMachine.Name)).To(BeNil(), tc.description)
}
})
})
Expand Down
16 changes: 0 additions & 16 deletions controllers/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
cgscheme "k8s.io/client-go/kubernetes/scheme"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -141,16 +140,6 @@ func newMachineContext(
}
}

func newMachineTemplateContext(
elfCluster *infrav1.ElfCluster, cluster *clusterv1.Cluster,
emt *infrav1.ElfMachineTemplate) *context.MachineTemplateContext {
return &context.MachineTemplateContext{
Cluster: cluster,
ElfCluster: elfCluster,
ElfMachineTemplate: emt,
}
}

type conditionAssertion struct {
conditionType clusterv1.ConditionType
status corev1.ConditionStatus
Expand All @@ -169,8 +158,3 @@ func expectConditions(getter conditions.Getter, expected []conditionAssertion) {
Expect(actual.Reason).To(Equal(c.reason))
}
}

func intOrStrPtr(i int32) *intstr.IntOrString {
res := intstr.FromInt(int(i))
return &res
}
3 changes: 2 additions & 1 deletion pkg/service/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,8 @@ func IsUpdateVMTask(task *models.Task) bool {
}

func IsUpdateVMDiskTask(task *models.Task, vmName string) bool {
return GetTowerString(task.Description) == fmt.Sprintf("Edit VM %s disk", vmName)
return GetTowerString(task.Description) == fmt.Sprintf("Edit VM %s disk", vmName) ||
strings.Contains(GetTowerString(task.Description), "Update virtual volume")
}

func IsVMColdMigrationTask(task *models.Task) bool {
Expand Down
2 changes: 1 addition & 1 deletion webhooks/elfmachinetemplate_webhook_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (v *ElfMachineTemplateValidator) ValidateCreate(ctx goctx.Context, obj runt
}

if elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket <= 0 {
allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), elfMachineTemplate.Spec.Template.Spec.NumCPUs, numCoresPerSocketCannotLessThanZeroMsg))
allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket, numCoresPerSocketCannotLessThanZeroMsg))
}

return nil, aggregateObjErrors(elfMachineTemplate.GroupVersionKind().GroupKind(), elfMachineTemplate.Name, allErrs)
Expand Down
Loading

0 comments on commit a9903bc

Please sign in to comment.