diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go index 0f53552e..29392663 100644 --- a/api/v1beta1/conditions_consts.go +++ b/api/v1beta1/conditions_consts.go @@ -76,6 +76,16 @@ const ( // are automatically re-tried by the controller. UpdatingFailedReason = "UpdatingFailed" + // DetachingGPUFailedReason (Severity=Warning) documents an ElfMachine controller detecting + // an error while detaching GPU devices; those kinds of errors are usually transient and failed operations + // are automatically re-tried by the controller. + DetachingGPUFailedReason = "DetachingGPUFailed" + + // AttachingGPUFailedReason (Severity=Warning) documents an ElfMachine controller detecting + // an error while attaching GPU devices; those kinds of errors are usually transient and failed operations + // are automatically re-tried by the controller. + AttachingGPUFailedReason = "AttachingGPUFailed" + // TaskFailureReason (Severity=Warning) documents an ElfMachine task failure; the reconcile loop will automatically // retry the operation, but a user intervention might be required to fix the problem. TaskFailureReason = "TaskFailure" @@ -95,6 +105,10 @@ const ( // WaitingForAvailableHostRequiredByPlacementGroupReason (Severity=Info) documents an ElfMachine // waiting for an available host required by placement group to create VM. WaitingForAvailableHostRequiredByPlacementGroupReason = "WaitingForAvailableHostRequiredByPlacementGroup" + + // WaitingForAvailableHostWithEnoughGPUsReason (Severity=Info) documents an ElfMachine + // waiting for an available host with enough GPUs to create the VM. + WaitingForAvailableHostWithEnoughGPUsReason = "WaitingForAvailableHostWithEnoughGPUs" ) // Conditions and Reasons related to make connections to a Tower. Can currently be used by ElfCluster and ElfMachine diff --git a/api/v1beta1/elfmachine_types.go b/api/v1beta1/elfmachine_types.go index 0a3570d6..0c046f4b 100644 --- a/api/v1beta1/elfmachine_types.go +++ b/api/v1beta1/elfmachine_types.go @@ -75,6 +75,14 @@ type ElfMachineSpec struct { // +optional DiskGiB int32 `json:"diskGiB,omitempty"` + // GPUDevices is the list of GPUs used by the virtual machine. + // +optional + GPUDevices []GPUPassthroughDeviceSpec `json:"gpuDevices,omitempty"` + + // VGPUDevices is the list of vGPUs used by the virtual machine. + // +optional + VGPUDevices []VGPUDeviceSpec `json:"vgpuDevices,omitempty"` + // +optional HA bool `json:"ha,omitempty"` @@ -106,6 +114,11 @@ type ElfMachineStatus struct { // +optional Network []NetworkStatus `json:"network,omitempty"` + // GPUDevices returns the GPU device status for each of the machine's configured + // GPU devices. + // +optional + GPUDevices []GPUStatus `json:"gpuDevices,omitempty"` + // FailureReason will be set in the event that there is a terminal problem // reconciling the Machine and will contain a succinct value suitable // for machine interpretation. @@ -300,6 +313,10 @@ func (m *ElfMachine) GetVMDisconnectionTimestamp() *metav1.Time { return nil } +func (m *ElfMachine) RequiresGPUDevices() bool { + return len(m.Spec.GPUDevices) > 0 || len(m.Spec.VGPUDevices) > 0 +} + //+kubebuilder:object:root=true // ElfMachineList contains a list of ElfMachine.
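Before the generated and controller changes below, a minimal sketch (not part of the patch) of how the new spec fields are meant to be consumed; the import path matches this repository's api/v1beta1 package, and RequiresGPUDevices is the helper added above:

package main

import (
	"fmt"

	infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1"
)

func main() {
	m := &infrav1.ElfMachine{}
	// Request one A16 passthrough GPU; Count defaults to 1 via the kubebuilder marker.
	m.Spec.GPUDevices = []infrav1.GPUPassthroughDeviceSpec{{Model: "A16", Count: 1}}
	// Alternatively, request a vGPU by type:
	// m.Spec.VGPUDevices = []infrav1.VGPUDeviceSpec{{Type: "NVIDIA A16-16A", Count: 1}}

	// true: the controller takes the GPU host-selection and locking path.
	fmt.Println(m.RequiresGPUDevices())
}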
diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go index de4b1f6d..23633bbd 100644 --- a/api/v1beta1/types.go +++ b/api/v1beta1/types.go @@ -146,6 +146,37 @@ type NetworkDeviceRouteSpec struct { Network string `json:"network,omitempty"` } +// GPUPassthroughDeviceSpec defines a virtual machine's GPU passthrough configuration. +type GPUPassthroughDeviceSpec struct { + // Model is the model name of a physical GPU, e.g. 'A16'. + Model string `json:"model,omitempty"` + + // Count is the number of GPUs. Defaults to 1. + // +optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=1 + Count int32 `json:"count,omitempty"` +} + +// VGPUDeviceSpec defines a virtual machine's vGPU configuration. +type VGPUDeviceSpec struct { + // Type is the type name of a virtual GPU, e.g. 'NVIDIA A16-16A'. + // +kubebuilder:validation:Required + Type string `json:"type,omitempty"` + + // Count is the number of vGPUs. Defaults to 1. + // +optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=1 + Count int32 `json:"count,omitempty"` +} + +// GPUStatus provides information about one of a VM's GPU devices. +type GPUStatus struct { + GPUID string `json:"gpuId,omitempty"` + Name string `json:"name,omitempty"` +} + //+kubebuilder:object:generate=false // PatchStringValue is for patching resources. diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index a689303f..d1f535c0 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -198,6 +198,16 @@ func (in *ElfMachineSpec) DeepCopyInto(out *ElfMachineSpec) { **out = **in } in.Network.DeepCopyInto(&out.Network) + if in.GPUDevices != nil { + in, out := &in.GPUDevices, &out.GPUDevices + *out = make([]GPUPassthroughDeviceSpec, len(*in)) + copy(*out, *in) + } + if in.VGPUDevices != nil { + in, out := &in.VGPUDevices, &out.VGPUDevices + *out = make([]VGPUDeviceSpec, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ElfMachineSpec. @@ -232,6 +242,11 @@ func (in *ElfMachineStatus) DeepCopyInto(out *ElfMachineStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.GPUDevices != nil { + in, out := &in.GPUDevices, &out.GPUDevices + *out = make([]GPUStatus, len(*in)) + copy(*out, *in) + } if in.FailureReason != nil { in, out := &in.FailureReason, &out.FailureReason *out = new(errors.MachineStatusError) @@ -344,6 +359,36 @@ func (in *ElfMachineTemplateSpec) DeepCopy() *ElfMachineTemplateSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUPassthroughDeviceSpec) DeepCopyInto(out *GPUPassthroughDeviceSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPassthroughDeviceSpec. +func (in *GPUPassthroughDeviceSpec) DeepCopy() *GPUPassthroughDeviceSpec { + if in == nil { + return nil + } + out := new(GPUPassthroughDeviceSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUStatus) DeepCopyInto(out *GPUStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus.
+func (in *GPUStatus) DeepCopy() *GPUStatus { + if in == nil { + return nil + } + out := new(GPUStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NetworkDeviceRouteSpec) DeepCopyInto(out *NetworkDeviceRouteSpec) { *out = *in @@ -445,3 +490,18 @@ func (in *Tower) DeepCopy() *Tower { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VGPUDeviceSpec) DeepCopyInto(out *VGPUDeviceSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VGPUDeviceSpec. +func (in *VGPUDeviceSpec) DeepCopy() *VGPUDeviceSpec { + if in == nil { + return nil + } + out := new(VGPUDeviceSpec) + in.DeepCopyInto(out) + return out +} diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml index aa65e539..6cbaae13 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.11.4 + controller-gen.kubebuilder.io/version: v0.12.0 name: elfclusters.infrastructure.cluster.x-k8s.io spec: group: infrastructure.cluster.x-k8s.io diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml index 5d9c58a3..ff18ca28 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.11.4 + controller-gen.kubebuilder.io/version: v0.12.0 name: elfmachines.infrastructure.cluster.x-k8s.io spec: group: infrastructure.cluster.x-k8s.io @@ -76,6 +76,24 @@ spec: this infrastructure provider, the name is equivalent to the name of the ElfDeploymentZone. type: string + gpuDevices: + description: GPUDevices is the list of GPUs used by the virtual machine. + items: + description: GPUPassthroughDeviceSpec defines a virtual machine's + GPU passthrough configuration. + properties: + count: + default: 1 + description: Count is the number of GPUs. Defaults to 1. + format: int32 + minimum: 1 + type: integer + model: + description: Model is the model name of a physical GPU, e.g. + 'A16'. + type: string + type: object + type: array ha: type: boolean host: @@ -183,6 +201,24 @@ spec: description: Template is the name or ID of the template used to clone new machines. type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the virtual + machine. + items: + description: VGPUDeviceSpec defines a virtual machine's vGPU configuration. + properties: + count: + default: 1 + description: Count is the number of vGPUs. Defaults to 1. + format: int32 + minimum: 1 + type: integer + type: + description: Type is the type name of a virtual GPU, e.g. 'NVIDIA + A16-16A'. + type: string + type: object + type: array required: - template type: object @@ -282,6 +318,19 @@ spec: during the reconciliation of Machines can be added as events to the Machine object and/or logged in the controller's output."
type: string + gpuDevices: + description: GPUDevices returns the GPU device status for each of + the machine's configured GPU devices. + items: + description: GPUStatus provides information about one of a VM's + GPU devices. + properties: + gpuId: + type: string + name: + type: string + type: object + type: array hostServerName: description: HostServerName is the name of host server where the virtual machine runs on. This value is set automatically at runtime and diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml index 114e614e..811567f0 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.11.4 + controller-gen.kubebuilder.io/version: v0.12.0 name: elfmachinetemplates.infrastructure.cluster.x-k8s.io spec: group: infrastructure.cluster.x-k8s.io @@ -56,6 +56,26 @@ spec: API. For this infrastructure provider, the name is equivalent to the name of the ElfDeploymentZone. type: string + gpuDevices: + description: GPUDevices is the list of GPUs used by the virtual + machine. + items: + description: GPUPassthroughDeviceSpec defines a virtual machine's + GPU passthrough configuration. + properties: + count: + default: 1 + description: Count is the number of GPUs. Defaults to + 1. + format: int32 + minimum: 1 + type: integer + model: + description: Model is the model name of a physical GPU, + e.g. 'A16'. + type: string + type: object + type: array ha: type: boolean host: @@ -165,6 +185,26 @@ spec: description: Template is the name or ID of the template used to clone new machines. type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the + virtual machine. + items: + description: VGPUDeviceSpec defines a virtual machine's vGPU + configuration. + properties: + count: + default: 1 + description: Count is the number of vGPUs. Defaults to + 1. + format: int32 + minimum: 1 + type: integer + type: + description: Type is the type name of a virtual GPU, + e.g. 'NVIDIA A16-16A'. + type: string + type: object + type: array required: - template type: object diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go index f02cfa82..65bcbfd2 100644 --- a/controllers/elfmachine_controller.go +++ b/controllers/elfmachine_controller.go @@ -302,6 +302,16 @@ func (r *ElfMachineReconciler) reconcileDelete(ctx *context.MachineContext) (rec conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "") + defer func() { + // When deleting a virtual machine, the GPU devices + // locked by the virtual machine may not be unlocked. + // For example, the Cluster or ElfMachine was deleted during a pause. + if !ctrlutil.ContainsFinalizer(ctx.ElfMachine, infrav1.MachineFinalizer) && + ctx.ElfMachine.RequiresGPUDevices() { + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + } + }() + if ok, err := r.deletePlacementGroup(ctx); err != nil { return reconcile.Result{}, err } else if !ok { @@ -470,6 +480,8 @@ func (r *ElfMachineReconciler) reconcileNormal(ctx *context.MachineContext) (rec // The returned bool value: // 1. true means that the VM is running and joined a placement group (if needed). // 2.
false and a nil error means the VM is not running or is waiting to join the placement group. +// +//nolint:gocyclo func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models.VM, bool, error) { // If there is no vmRef then no VM exists, create one if !ctx.ElfMachine.HasVM() { @@ -507,14 +519,24 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models return nil, false, nil } - hostID, err := r.preCheckPlacementGroup(ctx) - if err != nil || hostID == nil { - return nil, false, err + var hostID *string + var gpuDevices []*models.GpuDevice + // Control plane virtual machines do not support GPU devices. + if machineutil.IsControlPlaneMachine(ctx.Machine) { + hostID, err = r.preCheckPlacementGroup(ctx) + if err != nil || hostID == nil { + return nil, false, err + } + } else { + hostID, gpuDevices, err = r.selectHostAndGPUsForVM(ctx, "") + if err != nil || hostID == nil { + return nil, false, err + } } ctx.Logger.Info("Create VM for ElfMachine") - withTaskVM, err := ctx.VMService.Clone(ctx.ElfCluster, ctx.Machine, ctx.ElfMachine, bootstrapData, *hostID) + withTaskVM, err := ctx.VMService.Clone(ctx.ElfCluster, ctx.ElfMachine, bootstrapData, *hostID, gpuDevices) if err != nil { releaseTicketForCreateVM(ctx.ElfMachine.Name) @@ -526,6 +548,11 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models ctx.ElfMachine.SetVM(util.GetVMRef(vm)) } else { + // Duplicate VM error does not require unlocking GPU devices. + if ctx.ElfMachine.RequiresGPUDevices() { + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + } + ctx.Logger.Error(err, "failed to create VM", "vmRef", ctx.ElfMachine.Status.VMRef, "taskRef", ctx.ElfMachine.Status.TaskRef) @@ -595,6 +622,10 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models return vm, false, err } + if ok, err := r.reconcileGPUDevices(ctx, vm); err != nil || !ok { + return vm, false, err + } + if ok, err := r.reconcileVMStatus(ctx, vm); err != nil || !ok { return vm, false, err } @@ -844,6 +875,9 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * switch *task.Status { case models.TaskStatusFAILED: errorMessage := service.GetTowerString(task.ErrorMessage) + if service.IsGPUAssignFailed(errorMessage) { + errorMessage = service.ParseGPUAssignFailed(errorMessage) + } conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.TaskFailureReason, clusterv1.ConditionSeverityInfo, errorMessage) if service.IsCloudInitConfigError(errorMessage) { @@ -860,6 +894,14 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * if service.IsVMDuplicateError(errorMessage) { setVMDuplicate(ctx.ElfMachine.Name) } + + if ctx.ElfMachine.RequiresGPUDevices() { + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + } + case service.IsPowerOnVMTask(task) || service.IsUpdateVMTask(task): + if ctx.ElfMachine.RequiresGPUDevices() { + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + } case service.IsMemoryInsufficientError(errorMessage): recordElfClusterMemoryInsufficient(ctx, true) message := fmt.Sprintf("Insufficient memory detected for the ELF cluster %s", ctx.ElfCluster.Spec.Cluster) @@ -878,9 +920,16 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * case models.TaskStatusSUCCESSED: ctx.Logger.Info("VM task succeeded", "vmRef", vmRef, "taskRef", taskRef, "taskDescription",
service.GetTowerString(task.Description)) + if service.IsCloneVMTask(task) || service.IsUpdateVMTask(task) { + if ctx.ElfMachine.RequiresGPUDevices() { + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + } + } + if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) { releaseTicketForCreateVM(ctx.ElfMachine.Name) recordElfClusterMemoryInsufficient(ctx, false) + if err := recordPlacementGroupPolicyNotSatisfied(ctx, false); err != nil { return true, err } diff --git a/controllers/elfmachine_controller_gpu.go b/controllers/elfmachine_controller_gpu.go new file mode 100644 index 00000000..cc264fc0 --- /dev/null +++ b/controllers/elfmachine_controller_gpu.go @@ -0,0 +1,316 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "github.com/pkg/errors" + "github.com/smartxworks/cloudtower-go-sdk/v2/models" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/utils/pointer" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/pkg/context" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service" +) + +// selectHostAndGPUsForVM returns the host running the virtual machine +// and the GPU devices allocated to the virtual machine. + +// +// By default, a host is selected at random from the available hosts +// that meet the GPU requirements of the virtual machine. +// If preferredHostID is specified, the specified host will be given priority +// if it meets the GPU requirements. +// +// The returned rethost: +// 1. nil means no available host meets the GPU requirements. +// 2. An empty string indicates that the host does not need to be specified. +// 3. A non-empty string is the ID of the selected host. +// +// The returned gpudevices: the GPU devices selected for the virtual machine. +func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx *context.MachineContext, preferredHostID string) (rethost *string, gpudevices []*models.GpuDevice, reterr error) { + if !ctx.ElfMachine.RequiresGPUDevices() { + return pointer.String(""), nil, nil + } + + defer func() { + if rethost == nil { + conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForAvailableHostWithEnoughGPUsReason, clusterv1.ConditionSeverityInfo, "") + + ctx.Logger.V(1).Info("No host with the required GPU devices for the virtual machine, so wait for enough available hosts") + } + }() + + // If the GPU devices locked by the virtual machine still exist, use them directly.
+ if lockedVMGPUs := getGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name); lockedVMGPUs != nil { + if ok, gpuDevices, err := r.checkGPUsCanBeUsedForVM(ctx, lockedVMGPUs.GPUDeviceIDs, ctx.ElfMachine.Name); err != nil { + return nil, nil, err + } else if ok { + ctx.Logger.V(1).Info("Found locked VM GPU devices, so skip allocation", "lockedVMGPUs", lockedVMGPUs) + + return &lockedVMGPUs.HostID, gpuDevices, nil + } + + // If the GPU devices returned by Tower are inconsistent with the locked GPUs, + // delete the locked GPU devices and reallocate. + ctx.Logger.V(1).Info("Locked VM GPU devices are invalid, so remove and reallocate", "lockedVMGPUs", lockedVMGPUs) + + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + } + + hosts, err := ctx.VMService.GetHostsByCluster(ctx.ElfCluster.Spec.Cluster) + if err != nil { + return nil, nil, err + } + + availableHosts := hosts.FilterAvailableHostsWithEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) + if len(availableHosts) == 0 { + return nil, nil, nil + } + + // Get all GPU devices of available hosts. + gpuDevices, err := ctx.VMService.FindGPUDevicesByHostIDs(availableHosts.IDs()) + if err != nil { + return nil, nil, err + } + + lockedClusterGPUIDs := getLockedClusterGPUIDs(ctx.ElfCluster.Spec.Cluster) + + // Group GPU devices by host. + hostGPUDeviceMap := make(map[string][]*models.GpuDevice) + hostIDSet := sets.NewString() + for i := 0; i < len(gpuDevices); i++ { + // Filter already used or locked GPU devices. + if !service.GPUCanBeUsedForVM(gpuDevices[i], ctx.ElfMachine.Name) || + lockedClusterGPUIDs.Has(*gpuDevices[i].ID) { + continue + } + + hostIDSet.Insert(*gpuDevices[i].Host.ID) + if gpus, ok := hostGPUDeviceMap[*gpuDevices[i].Host.ID]; !ok { + hostGPUDeviceMap[*gpuDevices[i].Host.ID] = []*models.GpuDevice{gpuDevices[i]} + } else { + hostGPUDeviceMap[*gpuDevices[i].Host.ID] = append(gpus, gpuDevices[i]) + } + } + + // Choose a host that meets the ElfMachine's GPU needs. + // Use a random host list to reduce the probability of the same host being selected at the same time. + var unsortedHostIDs []string + if hostIDSet.Has(preferredHostID) { + hostIDSet.Delete(preferredHostID) + // Prioritize the preferred host + unsortedHostIDs = append(unsortedHostIDs, preferredHostID) + unsortedHostIDs = append(unsortedHostIDs, hostIDSet.UnsortedList()...) + } else { + unsortedHostIDs = hostIDSet.UnsortedList() + } + + for i := 0; i < len(unsortedHostIDs); i++ { + if hostGPUDevices, ok := hostGPUDeviceMap[unsortedHostIDs[i]]; ok { + selectedGPUDevices := selectGPUDevicesForVM(hostGPUDevices, ctx.ElfMachine.Spec.GPUDevices) + if len(selectedGPUDevices) > 0 { + gpuDeviceIDs := make([]string, len(selectedGPUDevices)) + for i := 0; i < len(selectedGPUDevices); i++ { + gpuDeviceIDs[i] = *selectedGPUDevices[i].ID + } + + // Lock the selected GPU devices to prevent them from being allocated to multiple virtual machines. + if !lockGPUDevicesForVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name, unsortedHostIDs[i], gpuDeviceIDs) { + // Lock failure indicates that the GPU devices are locked by another virtual machine. + // Just try other hosts. + continue + } + + ctx.Logger.Info("Selected host and GPU devices for VM", "hostId", unsortedHostIDs[i], "gpuDeviceIds", gpuDeviceIDs) + + return &unsortedHostIDs[i], selectedGPUDevices, nil + } + } + } + + return nil, nil, nil +} + +// selectGPUDevicesForVM selects the GPU devices required by the virtual machine from the host's GPU devices.
+// An empty result indicates that the host's GPU devices cannot meet the GPU requirements of the virtual machine. +func selectGPUDevicesForVM(hostGPUDevices []*models.GpuDevice, requiredGPUDevices []infrav1.GPUPassthroughDeviceSpec) []*models.GpuDevice { + // Group GPU devices by model. + modelGPUDeviceMap := make(map[string][]*models.GpuDevice) + for i := 0; i < len(hostGPUDevices); i++ { + if gpus, ok := modelGPUDeviceMap[*hostGPUDevices[i].Model]; !ok { + modelGPUDeviceMap[*hostGPUDevices[i].Model] = []*models.GpuDevice{hostGPUDevices[i]} + } else { + modelGPUDeviceMap[*hostGPUDevices[i].Model] = append(gpus, hostGPUDevices[i]) + } + } + + var selectedGPUDevices []*models.GpuDevice + for i := 0; i < len(requiredGPUDevices); i++ { + if gpus, ok := modelGPUDeviceMap[requiredGPUDevices[i].Model]; !ok { + return nil + } else { + if len(gpus) < int(requiredGPUDevices[i].Count) { + return nil + } + + selectedGPUDevices = append(selectedGPUDevices, gpus[:int(requiredGPUDevices[i].Count)]...) + // Remove selected GPU devices. + modelGPUDeviceMap[requiredGPUDevices[i].Model] = gpus[int(requiredGPUDevices[i].Count):] + } + } + + return selectedGPUDevices +} + +// reconcileGPUDevices ensures that the virtual machine has the expected GPU devices. +func (r *ElfMachineReconciler) reconcileGPUDevices(ctx *context.MachineContext, vm *models.VM) (bool, error) { + if !ctx.ElfMachine.RequiresGPUDevices() { + return true, nil + } + + // Ensure GPUStatus is set or up to date. + gpuDevices := make([]infrav1.GPUStatus, len(vm.GpuDevices)) + for i := 0; i < len(vm.GpuDevices); i++ { + gpuDevices[i] = infrav1.GPUStatus{GPUID: *vm.GpuDevices[i].ID, Name: *vm.GpuDevices[i].Name} + } + ctx.ElfMachine.Status.GPUDevices = gpuDevices + + if *vm.Status != models.VMStatusSTOPPED { + return true, nil + } + + // The GPU devices have been removed, so new GPU devices need to be selected. + if len(vm.GpuDevices) == 0 { + return r.addGPUDevicesForVM(ctx, vm) + } + + // If the GPU devices are already in use, remove the GPU devices first and then reselect new GPU devices. + message := conditions.GetMessage(ctx.ElfMachine, infrav1.VMProvisionedCondition) + if service.IsGPUAssignFailed(message) { + ctx.Logger.Info("GPU devices of the host are not sufficient and the virtual machine cannot be started, so remove the GPU devices and reallocate.") + + return false, r.removeVMGPUDevices(ctx, vm) + } + + gpuIDs := make([]string, len(vm.GpuDevices)) + for i := 0; i < len(vm.GpuDevices); i++ { + gpuIDs[i] = *vm.GpuDevices[i].ID + } + + if ok, _, err := r.checkGPUsCanBeUsedForVM(ctx, gpuIDs, ctx.ElfMachine.Name); err != nil { + return false, err + } else if !ok { + // If the GPU devices are already in use, + // remove the GPU devices first and then reallocate new GPU devices. + ctx.Logger.V(1).Info("GPU devices of VM are already in use, so remove and reallocate", "gpuIDs", gpuIDs) + + return false, r.removeVMGPUDevices(ctx, vm) + } + + return true, nil +} + +// addGPUDevicesForVM adds expected GPU devices to the virtual machine.
+func (r *ElfMachineReconciler) addGPUDevicesForVM(ctx *context.MachineContext, vm *models.VM) (bool, error) { + hostID, gpuDevices, err := r.selectHostAndGPUsForVM(ctx, *vm.Host.ID) + if err != nil || hostID == nil { + return false, err + } + + if *vm.Host.ID != *hostID { + ctx.Logger.Info("The current host does not have enough GPU devices, so the virtual machine needs to be migrated to a host that meets the GPU device requirements.", "currentHost", *vm.Host.ID, "targetHost", *hostID) + + ok, err := r.migrateVM(ctx, vm, *hostID) + if err != nil { + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + } + + return ok, err + } + + gpus := make([]*models.VMGpuOperationParams, len(gpuDevices)) + for i := 0; i < len(gpuDevices); i++ { + gpus[i] = &models.VMGpuOperationParams{ + GpuID: gpuDevices[i].ID, + Amount: service.TowerInt32(1), + } + } + + task, err := ctx.VMService.AddGPUDevices(ctx.ElfMachine.Status.VMRef, gpus) + if err != nil { + conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.AttachingGPUFailedReason, clusterv1.ConditionSeverityWarning, err.Error()) + + unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) + + return false, errors.Wrapf(err, "failed to trigger attaching GPU devices for VM %s", ctx) + } + + conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.UpdatingReason, clusterv1.ConditionSeverityInfo, "") + + ctx.ElfMachine.SetTask(*task.ID) + + ctx.Logger.Info("Waiting for VM to attach GPU devices", "vmRef", ctx.ElfMachine.Status.VMRef, "taskRef", ctx.ElfMachine.Status.TaskRef) + + return false, nil +} + +// removeVMGPUDevices removes all GPU devices from the virtual machine. +func (r *ElfMachineReconciler) removeVMGPUDevices(ctx *context.MachineContext, vm *models.VM) error { + staleGPUs := make([]*models.VMGpuOperationParams, len(vm.GpuDevices)) + for i := 0; i < len(vm.GpuDevices); i++ { + staleGPUs[i] = &models.VMGpuOperationParams{ + GpuID: vm.GpuDevices[i].ID, + Amount: service.TowerInt32(1), + } + } + + task, err := ctx.VMService.RemoveGPUDevices(ctx.ElfMachine.Status.VMRef, staleGPUs) + if err != nil { + conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.DetachingGPUFailedReason, clusterv1.ConditionSeverityWarning, err.Error()) + + return errors.Wrapf(err, "failed to trigger detaching stale GPU devices for VM %s", ctx) + } + + conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.UpdatingReason, clusterv1.ConditionSeverityInfo, "") + + ctx.ElfMachine.SetTask(*task.ID) + + ctx.Logger.Info("Waiting for stale GPU devices to be removed from VM", "vmRef", ctx.ElfMachine.Status.VMRef, "taskRef", ctx.ElfMachine.Status.TaskRef) + + return nil +} + +// checkGPUsCanBeUsedForVM checks whether GPU devices can be used by the specified virtual machine. +// A return value of true means the GPU devices can be used for the virtual machine.
+func (r *ElfMachineReconciler) checkGPUsCanBeUsedForVM(ctx *context.MachineContext, gpuDeviceIDs []string, vm string) (bool, []*models.GpuDevice, error) { + gpuDevices, err := ctx.VMService.FindGPUDevicesByIDs(gpuDeviceIDs) + if err != nil { + return false, nil, err + } + + if len(gpuDevices) != len(gpuDeviceIDs) { + return false, nil, nil + } + + if len(service.FilterOutGPUsCanNotBeUsedForVM(gpuDevices, vm)) != len(gpuDeviceIDs) { + return false, nil, nil + } + + return true, gpuDevices, nil +} diff --git a/controllers/elfmachine_controller_gpu_test.go b/controllers/elfmachine_controller_gpu_test.go new file mode 100644 index 00000000..01483c3f --- /dev/null +++ b/controllers/elfmachine_controller_gpu_test.go @@ -0,0 +1,361 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "bytes" + goctx "context" + + "github.com/go-logr/logr" + "github.com/golang/mock/gomock" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/pkg/errors" + "github.com/smartxworks/cloudtower-go-sdk/v2/models" + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service/mock_services" + "github.com/smartxworks/cluster-api-provider-elf/test/fake" +) + +var _ = Describe("ElfMachineReconciler-GPU", func() { + var ( + elfCluster *infrav1.ElfCluster + cluster *clusterv1.Cluster + elfMachine *infrav1.ElfMachine + machine *clusterv1.Machine + md *clusterv1.MachineDeployment + secret *corev1.Secret + logBuffer *bytes.Buffer + mockCtrl *gomock.Controller + mockVMService *mock_services.MockVMService + mockNewVMService service.NewVMServiceFunc + ) + + gpuModel := "A16" + unexpectedError := errors.New("unexpected error") + + BeforeEach(func() { + logBuffer = new(bytes.Buffer) + klog.SetOutput(logBuffer) + + elfCluster, cluster, elfMachine, machine, secret = fake.NewClusterAndMachineObjects() + md = fake.NewMD() + fake.ToWorkerMachine(machine, md) + fake.ToWorkerMachine(elfMachine, md) + + // mock + mockCtrl = gomock.NewController(GinkgoT()) + mockVMService = mock_services.NewMockVMService(mockCtrl) + mockNewVMService = func(_ goctx.Context, _ infrav1.Tower, _ logr.Logger) (service.VMService, error) { + return mockVMService, nil + } + }) + + AfterEach(func() { + mockCtrl.Finish() + }) + + Context("selectHostAndGPUsForVM", func() { + + BeforeEach(func() { + elfMachine.Spec.GPUDevices = append(elfMachine.Spec.GPUDevices, infrav1.GPUPassthroughDeviceSpec{Model: gpuModel, Count: 1}) + }) + + It("should not handle ElfMachine without GPU", func() { + elfMachine.Spec.GPUDevices = nil + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + + machineContext := newMachineContext(ctrlContext, 
elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + host, gpus, err := reconciler.selectHostAndGPUsForVM(machineContext, "") + Expect(err).NotTo(HaveOccurred()) + Expect(*host).To(BeEmpty()) + Expect(gpus).To(BeEmpty()) + }) + + It("should check and use locked GPUs", func() { + host := fake.NewTowerHost() + gpu := fake.NewTowerGPU() + gpu.Host = &models.NestedHost{ID: host.ID} + gpu.Model = service.TowerString(gpuModel) + gpuIDs := []string{*gpu.ID} + gpusDevices := []*models.GpuDevice{gpu} + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Return(gpusDevices, nil) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + hostID, gpus, err := reconciler.selectHostAndGPUsForVM(machineContext, "") + Expect(err).NotTo(HaveOccurred()) + Expect(*hostID).To(Equal(*host.ID)) + Expect(gpus).To(Equal(gpusDevices)) + + mockVMService.EXPECT().FindGPUDevicesByIDs(gpuIDs).Return(gpusDevices, nil) + hostID, gpus, err = reconciler.selectHostAndGPUsForVM(machineContext, "") + Expect(err).NotTo(HaveOccurred()) + Expect(*hostID).To(Equal(*host.ID)) + Expect(gpus).To(Equal(gpusDevices)) + Expect(logBuffer.String()).To(ContainSubstring("Found locked VM GPU devices")) + + logBuffer.Reset() + gpu.Vms = []*models.NestedVM{{ID: service.TowerString("id"), Name: service.TowerString("vm")}} + mockVMService.EXPECT().FindGPUDevicesByIDs(gpuIDs).Return(gpusDevices, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Return(gpusDevices, nil) + hostID, gpus, err = reconciler.selectHostAndGPUsForVM(machineContext, "") + Expect(err).NotTo(HaveOccurred()) + Expect(hostID).To(BeNil()) + Expect(gpus).To(BeEmpty()) + Expect(logBuffer.String()).To(ContainSubstring("Locked VM GPU devices are invalid")) + Expect(logBuffer.String()).To(ContainSubstring("No host with the required GPU devices for the virtual machine")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForAvailableHostWithEnoughGPUsReason}}) + Expect(getGPUDevicesLockedByVM(elfCluster.Spec.Cluster, elfMachine.Name)).To(BeNil()) + }) + + It("should prioritize the preferred host", func() { + host := fake.NewTowerHost() + gpu := fake.NewTowerGPU() + gpu.Host = &models.NestedHost{ID: host.ID} + gpu.Model = service.TowerString(gpuModel) + preferredHost := fake.NewTowerHost() + preferredGPU := fake.NewTowerGPU() + preferredGPU.Host = &models.NestedHost{ID: preferredHost.ID} + preferredGPU.Model = service.TowerString(gpuModel) + gpusDevices := []*models.GpuDevice{gpu, preferredGPU} + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host, preferredHost), nil) + 
mockVMService.EXPECT().FindGPUDevicesByHostIDs(gomock.InAnyOrder([]string{*host.ID, *preferredHost.ID})).Return(gpusDevices, nil) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + hostID, gpus, err := reconciler.selectHostAndGPUsForVM(machineContext, *preferredHost.ID) + Expect(err).NotTo(HaveOccurred()) + Expect(*hostID).To(Equal(*preferredHost.ID)) + Expect(gpus).To(Equal([]*models.GpuDevice{preferredGPU})) + }) + }) + + Context("reconcileGPUDevices", func() { + BeforeEach(func() { + elfMachine.Spec.GPUDevices = append(elfMachine.Spec.GPUDevices, infrav1.GPUPassthroughDeviceSpec{Model: gpuModel, Count: 1}) + }) + + It("should not handle ElfMachine without GPU", func() { + elfMachine.Spec.GPUDevices = nil + vm := fake.NewTowerVMFromElfMachine(elfMachine) + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileGPUDevices(machineContext, vm) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeTrue()) + }) + + It("should set .Status.GPUDevices when the virtual machine is not powered off", func() { + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.Status = models.NewVMStatus(models.VMStatusRUNNING) + vm.GpuDevices = []*models.NestedGpuDevice{{ID: service.TowerString(fake.ID()), Name: service.TowerString(fake.ID())}} + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileGPUDevices(machineContext, vm) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeTrue()) + Expect(elfMachine.Status.GPUDevices).To(Equal([]infrav1.GPUStatus{{GPUID: *vm.GpuDevices[0].ID, Name: *vm.GpuDevices[0].Name}})) + }) + + It("should add GPU devices to VM when the VM has no GPU devices", func() { + host := fake.NewTowerHost() + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.Host = &models.NestedHost{ID: host.ID} + vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(nil, unexpectedError) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileGPUDevices(machineContext, vm) + Expect(err).To(HaveOccurred()) + Expect(ok).To(BeFalse()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForAvailableHostWithEnoughGPUsReason}}) + }) + + It("should remove GPU devices from VM when the host's GPU devices are not sufficient", func()
{ + host := fake.NewTowerHost() + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.Host = &models.NestedHost{ID: host.ID} + vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) + vm.GpuDevices = []*models.NestedGpuDevice{{ID: service.TowerString(fake.ID()), Name: service.TowerString(gpuModel)}} + conditions.MarkFalse(elfMachine, infrav1.VMProvisionedCondition, infrav1.TaskFailureReason, clusterv1.ConditionSeverityInfo, service.GPUAssignFailed) + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().RemoveGPUDevices(elfMachine.Status.VMRef, gomock.Len(1)).Return(nil, unexpectedError) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileGPUDevices(machineContext, vm) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(unexpectedError.Error())) + Expect(ok).To(BeFalse()) + Expect(logBuffer.String()).To(ContainSubstring("GPU devices of the host are not sufficient and the virtual machine cannot be started")) + }) + + It("should check if GPU devices can be used for VM", func() { + host := fake.NewTowerHost() + gpu := fake.NewTowerGPU() + gpu.Host = &models.NestedHost{ID: host.ID} + gpu.Model = service.TowerString(gpuModel) + gpu.Vms = []*models.NestedVM{{ID: service.TowerString("id"), Name: service.TowerString("vm")}} + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.Host = &models.NestedHost{ID: host.ID} + vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) + vm.GpuDevices = []*models.NestedGpuDevice{{ID: gpu.ID, Name: gpu.Model}} + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().FindGPUDevicesByIDs([]string{*gpu.ID}).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().RemoveGPUDevices(elfMachine.Status.VMRef, gomock.Len(1)).Return(nil, unexpectedError) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileGPUDevices(machineContext, vm) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(unexpectedError.Error())) + Expect(ok).To(BeFalse()) + Expect(logBuffer.String()).To(ContainSubstring("GPU devices of VM are already in use, so remove and reallocate")) + + gpu.Vms = []*models.NestedVM{{ID: vm.ID, Name: vm.Name}} + ok, err = reconciler.reconcileGPUDevices(machineContext, vm) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeTrue()) + }) + }) + + Context("addGPUDevicesForVM", func() { + BeforeEach(func() { + elfMachine.Spec.GPUDevices = append(elfMachine.Spec.GPUDevices, infrav1.GPUPassthroughDeviceSpec{Model: gpuModel, Count: 1}) + }) + + It("should migrate VM when current host does not have enough GPU devices", func() { + host := fake.NewTowerHost() + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.Host = &models.NestedHost{ID: service.TowerString(fake.ID())} + elfMachine.Status.VMRef = *vm.LocalID + gpu := fake.NewTowerGPU() + gpu.Host = &models.NestedHost{ID: host.ID} + gpu.Model = service.TowerString(gpuModel) + task := fake.NewTowerTask() + 
withTaskVM := fake.NewWithTaskVM(vm, task) + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Times(2).Return(service.NewHosts(host), nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().Migrate(*vm.ID, *host.ID).Return(withTaskVM, nil) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.addGPUDevicesForVM(machineContext, vm) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeFalse()) + Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) + Expect(logBuffer.String()).To(ContainSubstring("The current host does not have enough GPU devices")) + + elfMachine.Status.TaskRef = "" + unlockGPUDevicesLockedByVM(elfCluster.Spec.Cluster, elfMachine.Name) + mockVMService.EXPECT().Migrate(*vm.ID, *host.ID).Return(nil, unexpectedError) + ok, err = reconciler.addGPUDevicesForVM(machineContext, vm) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(unexpectedError.Error())) + Expect(ok).To(BeFalse()) + Expect(elfMachine.Status.TaskRef).To(BeEmpty()) + }) + + It("should add GPU devices to VM", func() { + host := fake.NewTowerHost() + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.Host = &models.NestedHost{ID: host.ID} + elfMachine.Status.VMRef = *vm.LocalID + gpu := fake.NewTowerGPU() + gpu.Host = &models.NestedHost{ID: host.ID} + gpu.Model = service.TowerString(gpuModel) + task := fake.NewTowerTask() + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Times(2).Return(service.NewHosts(host), nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().AddGPUDevices(elfMachine.Status.VMRef, gomock.Any()).Return(task, nil) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.addGPUDevicesForVM(machineContext, vm) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeFalse()) + Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.UpdatingReason}}) + + elfMachine.Status.TaskRef = "" + unlockGPUDevicesLockedByVM(elfCluster.Spec.Cluster, elfMachine.Name) + mockVMService.EXPECT().AddGPUDevices(elfMachine.Status.VMRef, gomock.Any()).Return(task, unexpectedError) + ok, err = reconciler.addGPUDevicesForVM(machineContext, vm) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(unexpectedError.Error())) + Expect(ok).To(BeFalse()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.AttachingGPUFailedReason}}) + Expect(elfMachine.Status.TaskRef).To(BeEmpty()) + }) + }) + + Context("removeVMGPUDevices", func() { + 
It("should remove GPU devices of VM", func() { + vm := fake.NewTowerVMFromElfMachine(elfMachine) + elfMachine.Status.VMRef = *vm.LocalID + vm.GpuDevices = []*models.NestedGpuDevice{{ID: service.TowerString(fake.ID()), Name: service.TowerString("A16")}} + task := fake.NewTowerTask() + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().RemoveGPUDevices(elfMachine.Status.VMRef, gomock.Len(1)).Return(nil, unexpectedError) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + err := reconciler.removeVMGPUDevices(machineContext, vm) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(unexpectedError.Error())) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.DetachingGPUFailedReason}}) + Expect(elfMachine.Status.TaskRef).To(BeEmpty()) + + mockVMService.EXPECT().RemoveGPUDevices(elfMachine.Status.VMRef, gomock.Len(1)).Return(task, nil) + err = reconciler.removeVMGPUDevices(machineContext, vm) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.UpdatingReason}}) + Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) + }) + }) +}) diff --git a/controllers/vm_limiter.go b/controllers/vm_limiter.go index 84324908..2aa67df5 100644 --- a/controllers/vm_limiter.go +++ b/controllers/vm_limiter.go @@ -22,6 +22,7 @@ import ( "time" "github.com/patrickmn/go-cache" + "k8s.io/apimachinery/pkg/util/sets" "github.com/smartxworks/cluster-api-provider-elf/pkg/config" ) @@ -138,3 +139,106 @@ func getKeyForVM(name string) string { func getKeyForVMDuplicate(name string) string { return fmt.Sprintf("vm:duplicate:%s", name) } + +/* GPU */ + +type lockedVMGPUs struct { + HostID string `json:"hostId"` + GPUDeviceIDs []string `json:"gpuDeviceIds"` + LockedAt time.Time `json:"lockedAt"` +} + +type lockedClusterGPUMap map[string]lockedVMGPUs + +const gpuLockTimeout = time.Minute * 8 + +var gpuLock sync.Mutex +var lockedGPUMap = make(map[string]lockedClusterGPUMap) + +// lockGPUDevicesForVM locks the GPU devices required to create or start a virtual machine. +// The GPU devices will be unlocked when the task is completed or times out. +// This prevents multiple virtual machines from being allocated the same GPU. +func lockGPUDevicesForVM(clusterID, vmName, hostID string, gpuDeviceIDs []string) bool { + gpuLock.Lock() + defer gpuLock.Unlock() + + lockedClusterGPUIDs := getLockedClusterGPUIDsWithoutLock(clusterID) + for i := 0; i < len(gpuDeviceIDs); i++ { + if lockedClusterGPUIDs.Has(gpuDeviceIDs[i]) { + return false + } + } + + lockedClusterGPUs := getLockedClusterGPUs(clusterID) + lockedClusterGPUs[vmName] = lockedVMGPUs{ + HostID: hostID, + GPUDeviceIDs: gpuDeviceIDs, + LockedAt: time.Now(), + } + + lockedGPUMap[clusterID] = lockedClusterGPUs + + return true +} + +// getLockedClusterGPUIDs returns the locked GPU devices of the specified cluster. 
+func getLockedClusterGPUIDs(clusterID string) sets.Set[string] { + gpuLock.Lock() + defer gpuLock.Unlock() + + return getLockedClusterGPUIDsWithoutLock(clusterID) +} + +func getGPUDevicesLockedByVM(clusterID, vmName string) *lockedVMGPUs { + gpuLock.Lock() + defer gpuLock.Unlock() + + lockedClusterGPUs := getLockedClusterGPUs(clusterID) + if vmGPUs, ok := lockedClusterGPUs[vmName]; ok { + if time.Now().Before(vmGPUs.LockedAt.Add(gpuLockTimeout)) { + return &vmGPUs + } + + delete(lockedClusterGPUs, vmName) + } + + return nil +} + +// unlockGPUDevicesLockedByVM unlocks the GPU devices locked by the virtual machine. +func unlockGPUDevicesLockedByVM(clusterID, vmName string) { + gpuLock.Lock() + defer gpuLock.Unlock() + + lockedClusterGPUs := getLockedClusterGPUs(clusterID) + delete(lockedClusterGPUs, vmName) + + if len(lockedClusterGPUs) == 0 { + delete(lockedGPUMap, clusterID) + } else { + lockedGPUMap[clusterID] = lockedClusterGPUs + } +} + +func getLockedClusterGPUs(clusterID string) lockedClusterGPUMap { + if _, ok := lockedGPUMap[clusterID]; ok { + return lockedGPUMap[clusterID] + } + + return make(map[string]lockedVMGPUs) +} + +func getLockedClusterGPUIDsWithoutLock(clusterID string) sets.Set[string] { + gpuIDs := sets.Set[string]{} + + lockedClusterGPUs := getLockedClusterGPUs(clusterID) + for vmName, lockedGPUs := range lockedClusterGPUs { + if time.Now().Before(lockedGPUs.LockedAt.Add(gpuLockTimeout)) { + gpuIDs.Insert(lockedGPUs.GPUDeviceIDs...) + } else { + delete(lockedClusterGPUs, vmName) + } + } + + return gpuIDs +} diff --git a/controllers/vm_limiter_test.go b/controllers/vm_limiter_test.go index eb71dd88..53e529db 100644 --- a/controllers/vm_limiter_test.go +++ b/controllers/vm_limiter_test.go @@ -18,6 +18,7 @@ package controllers import ( "fmt" + "time" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -125,6 +126,61 @@ var _ = Describe("Placement Group Operation Limiter", func() { }) }) +var _ = Describe("Lock GPU devices for VM", func() { + var clusterID, vmName, hostID, gpuID string + + BeforeEach(func() { + clusterID = fake.UUID() + vmName = fake.UUID() + hostID = fake.UUID() + gpuID = fake.UUID() + }) + + It("lockGPUDevicesForVM", func() { + gpuIDs := []string{gpuID} + + lockedVMGPUs := getGPUDevicesLockedByVM(clusterID, vmName) + Expect(lockedVMGPUs).To(BeNil()) + lockedClusterGPUIDs := getLockedClusterGPUIDs(clusterID) + Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) + + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeTrue()) + lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) + Expect(lockedVMGPUs.HostID).To(Equal(hostID)) + Expect(lockedVMGPUs.GPUDeviceIDs).To(Equal(gpuIDs)) + Expect(lockedVMGPUs.LockedAt.Unix()).To(Equal(time.Now().Unix())) + lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) + Expect(lockedClusterGPUIDs.Len()).To(Equal(1)) + Expect(lockedClusterGPUIDs.Has(gpuID)).To(BeTrue()) + + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeFalse()) + + unlockGPUDevicesLockedByVM(clusterID, vmName) + lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) + Expect(lockedVMGPUs).To(BeNil()) + lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) + Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) + + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeTrue()) + vmGPUs := lockedGPUMap[clusterID][vmName] + vmGPUs.LockedAt = vmGPUs.LockedAt.Add(-gpuLockTimeout) + lockedGPUMap[clusterID][vmName] = vmGPUs + lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) + Expect(lockedVMGPUs).To(BeNil()) + lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) + Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) + + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeTrue()) + vmGPUs = lockedGPUMap[clusterID][vmName] + vmGPUs.LockedAt = vmGPUs.LockedAt.Add(-gpuLockTimeout) + lockedGPUMap[clusterID][vmName] = vmGPUs + lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) + Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) + lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) + Expect(lockedVMGPUs).To(BeNil()) + }) +}) + func resetVMConcurrentCache() { vmConcurrentCache.Flush() } diff --git a/go.mod b/go.mod index 50e69cb5..fd532b10 100644 --- a/go.mod +++ b/go.mod @@ -6,12 +6,12 @@ require ( github.com/go-logr/logr v1.2.4 github.com/golang/mock v1.6.0 github.com/google/uuid v1.3.0 - github.com/onsi/ginkgo/v2 v2.11.0 - github.com/onsi/gomega v1.27.8 + github.com/onsi/ginkgo/v2 v2.12.0 + github.com/onsi/gomega v1.27.10 github.com/patrickmn/go-cache v2.1.0+incompatible github.com/pkg/errors v0.9.1 - github.com/smartxworks/cloudtower-go-sdk/v2 v2.0.0-sks1.1-rc.2 - golang.org/x/mod v0.10.0 + github.com/smartxworks/cloudtower-go-sdk/v2 v2.11.1-rc-2023-09-14 + golang.org/x/mod v0.12.0 k8s.io/api v0.27.2 k8s.io/apiextensions-apiserver v0.27.2 k8s.io/apimachinery v0.27.2 @@ -33,10 +33,10 @@ require ( github.com/google/go-github/v48 v48.2.0 // indirect github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect github.com/google/safetext v0.0.0-20220905092116-b49f7bc46da2 // indirect - go.opentelemetry.io/otel v1.16.0 // indirect - go.opentelemetry.io/otel/metric v1.16.0 // indirect - go.opentelemetry.io/otel/trace v1.16.0 // indirect - golang.org/x/tools v0.9.3 // indirect + go.opentelemetry.io/otel v1.17.0 // indirect + go.opentelemetry.io/otel/metric v1.17.0 // 
indirect + go.opentelemetry.io/otel/trace v1.17.0 // indirect + golang.org/x/tools v0.12.0 // indirect ) require ( @@ -65,14 +65,14 @@ require ( github.com/evanphx/json-patch/v5 v5.6.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect github.com/go-openapi/analysis v0.21.4 // indirect - github.com/go-openapi/errors v0.20.3 // indirect - github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/errors v0.20.4 // indirect + github.com/go-openapi/jsonpointer v0.20.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/loads v0.21.2 // indirect github.com/go-openapi/runtime v0.26.0 github.com/go-openapi/spec v0.20.9 // indirect github.com/go-openapi/strfmt v0.21.7 - github.com/go-openapi/swag v0.22.3 // indirect + github.com/go-openapi/swag v0.22.4 // indirect github.com/go-openapi/validate v0.22.1 // indirect github.com/gobuffalo/flect v1.0.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -119,13 +119,13 @@ require ( github.com/stoewer/go-strcase v1.2.0 // indirect github.com/subosito/gotenv v1.4.2 // indirect github.com/valyala/fastjson v1.6.4 // indirect - go.mongodb.org/mongo-driver v1.11.6 // indirect - golang.org/x/crypto v0.11.0 // indirect - golang.org/x/net v0.13.0 // indirect + go.mongodb.org/mongo-driver v1.12.1 // indirect + golang.org/x/crypto v0.12.0 // indirect + golang.org/x/net v0.14.0 // indirect golang.org/x/oauth2 v0.10.0 // indirect - golang.org/x/sys v0.10.0 // indirect - golang.org/x/term v0.10.0 // indirect - golang.org/x/text v0.11.0 // indirect + golang.org/x/sys v0.11.0 // indirect + golang.org/x/term v0.11.0 // indirect + golang.org/x/text v0.12.0 // indirect golang.org/x/time v0.3.0 // indirect gomodules.xyz/jsonpatch/v2 v2.3.0 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/go.sum b/go.sum index 1aac64ef..f128f0b5 100644 --- a/go.sum +++ b/go.sum @@ -166,12 +166,14 @@ github.com/go-openapi/analysis v0.21.4/go.mod h1:4zQ35W4neeZTqh3ol0rv/O8JBbka9Qy github.com/go-openapi/errors v0.19.8/go.mod h1:cM//ZKUKyO06HSwqAelJ5NsEMMcpa6VpXe8DOa1Mi1M= github.com/go-openapi/errors v0.19.9/go.mod h1:cM//ZKUKyO06HSwqAelJ5NsEMMcpa6VpXe8DOa1Mi1M= github.com/go-openapi/errors v0.20.2/go.mod h1:cM//ZKUKyO06HSwqAelJ5NsEMMcpa6VpXe8DOa1Mi1M= -github.com/go-openapi/errors v0.20.3 h1:rz6kiC84sqNQoqrtulzaL/VERgkoCyB6WdEkc2ujzUc= github.com/go-openapi/errors v0.20.3/go.mod h1:Z3FlZ4I8jEGxjUK+bugx3on2mIAk4txuAOhlsB1FSgk= +github.com/go-openapi/errors v0.20.4 h1:unTcVm6PispJsMECE3zWgvG4xTiKda1LIR5rCRWLG6M= +github.com/go-openapi/errors v0.20.4/go.mod h1:Z3FlZ4I8jEGxjUK+bugx3on2mIAk4txuAOhlsB1FSgk= github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= -github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonpointer v0.20.0 h1:ESKJdU9ASRfaPNOPRx12IUyA1vn3R9GiE3KYD14BXdQ= +github.com/go-openapi/jsonpointer v0.20.0/go.mod h1:6PGzBjjIIumbLYysB73Klnms1mwnU4G3YHOECG3CedA= github.com/go-openapi/jsonreference v0.19.6/go.mod h1:diGHMEHg2IqXZGKxqyvWdfWU/aim5Dprw5bqpKkTvns= github.com/go-openapi/jsonreference v0.20.0/go.mod h1:Ag74Ico3lPc+zR+qjn4XBUmXymS4zJbYVCZmcgkasdo= github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= @@ -194,8 +196,9 @@ github.com/go-openapi/strfmt 
v0.21.7/go.mod h1:adeGTkxE44sPyLk0JV235VQAO/ZXUr8KA github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= github.com/go-openapi/swag v0.19.15/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= github.com/go-openapi/swag v0.21.1/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= -github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU= +github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-openapi/validate v0.22.1 h1:G+c2ub6q47kfX1sOBLwIQwzBVt8qmOAARyo/9Fqs9NU= github.com/go-openapi/validate v0.22.1/go.mod h1:rjnrwK57VJ7A8xqfpAOEKRH8yQSGUriMu5/zuPSQ1hg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= @@ -383,6 +386,7 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -457,8 +461,10 @@ github.com/onsi/ginkgo/v2 v2.9.0/go.mod h1:4xkjoL/tZv4SMWeww56BU5kAt19mVB47gTWxm github.com/onsi/ginkgo/v2 v2.9.1/go.mod h1:FEcmzVcCHl+4o9bQZVab+4dC9+j+91t2FHSzmGAPfuo= github.com/onsi/ginkgo/v2 v2.9.2/go.mod h1:WHcJJG2dIlcCqVfBAwUCrJxSPFb6v4azBwgxeMeDuts= github.com/onsi/ginkgo/v2 v2.9.5/go.mod h1:tvAoo1QUJwNEU2ITftXTpR7R1RbCzoZUOs3RonqW57k= -github.com/onsi/ginkgo/v2 v2.11.0 h1:WgqUCUt/lT6yXoQ8Wef0fsNn5cAuMK7+KT9UFRz2tcU= +github.com/onsi/ginkgo/v2 v2.9.7/go.mod h1:cxrmXWykAwTwhQsJOPfdIDiJ+l2RYq7U8hFU+M/1uw0= github.com/onsi/ginkgo/v2 v2.11.0/go.mod h1:ZhrRA5XmEE3x3rhlzamx/JJvujdZoJ2uvgI7kR0iZvM= +github.com/onsi/ginkgo/v2 v2.12.0 h1:UIVDowFPwpg6yMUpPjGkYvf06K3RAiJXUhCxEwQVHRI= +github.com/onsi/ginkgo/v2 v2.12.0/go.mod h1:ZNEzXISYlqpb8S36iN71ifqLi3vVD1rVJGvWRCJOUpQ= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY= @@ -474,8 +480,9 @@ github.com/onsi/gomega v1.27.3/go.mod h1:5vG284IBtfDAmDyrK+eGyZmUgUlmi+Wngqo557c github.com/onsi/gomega v1.27.4/go.mod h1:riYq/GJKh8hhoM01HN6Vmuy93AarCXCBGpvFDK3q3fQ= github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg= github.com/onsi/gomega v1.27.7/go.mod h1:1p8OOlwo2iUUDsHnOrjE5UKYJ+e3W8eQ3qSlRahPmr4= -github.com/onsi/gomega v1.27.8 h1:gegWiwZjBsf2DgiSbf5hpokZ98JVDMcWkUiigk6/KXc= github.com/onsi/gomega v1.27.8/go.mod h1:2J8vzI/s+2shY9XHRApDkdgPo1TKT7P2u6fXeJKFnNQ= +github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI= +github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= 
github.com/opencontainers/image-spec v1.0.2 h1:9yCKha/T5XdGtO0q9Q9a6T5NUCsTn/DrBg0D7ufOcFM= @@ -493,6 +500,7 @@ github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3v github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ= github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -524,6 +532,7 @@ github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6L github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.2.2/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -538,8 +547,8 @@ github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= -github.com/smartxworks/cloudtower-go-sdk/v2 v2.0.0-sks1.1-rc.2 h1:dg91utWrMm2M99uSPYz3U1N7B4sReZyY8g67BHej28I= -github.com/smartxworks/cloudtower-go-sdk/v2 v2.0.0-sks1.1-rc.2/go.mod h1:a22xOjZMHOeeOAFWOnlOQl26bpumS+thkl2UAdfKmAc= +github.com/smartxworks/cloudtower-go-sdk/v2 v2.11.1-rc-2023-09-14 h1:CHJLqIwjPHMKpnlR7wXmKUr9n2Ba7KhR5LG63S4TqQY= +github.com/smartxworks/cloudtower-go-sdk/v2 v2.11.1-rc-2023-09-14/go.mod h1:X6R9+L438SMnLJXykSCV3fJ+AZul0hlyjITsZgrSRtM= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= @@ -581,13 +590,13 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= 
github.com/subosito/gotenv v1.4.2 h1:X1TuBLAMDFbaTAChgCBLu3DU3UPyELpnF2jjJ2cz/S8= github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= github.com/thoas/go-funk v0.9.3/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= -github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ= @@ -595,8 +604,10 @@ github.com/valyala/fastjson v1.6.4/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLr github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= github.com/xdg-go/scram v1.0.2/go.mod h1:1WAq6h33pAW+iRreB34OORO2Nf7qel3VV3fjBj+hCSs= github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= github.com/xdg-go/stringprep v1.0.2/go.mod h1:8F9zXuvzgwmyT5DUm4GUfZGDdT3W+LCvS6+da4O5kxM= github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= @@ -614,8 +625,8 @@ go.mongodb.org/mongo-driver v1.7.3/go.mod h1:NqaYOwnXWr5Pm7AOpO5QFxKJ503nbMse/R7 go.mongodb.org/mongo-driver v1.7.5/go.mod h1:VXEWRZ6URJIkUq2SCAyapmhH0ZLRBP+FT4xhp5Zvxng= go.mongodb.org/mongo-driver v1.10.0/go.mod h1:wsihk0Kdgv8Kqu1Anit4sfK+22vSFbUrAVEYRhCXrA8= go.mongodb.org/mongo-driver v1.11.3/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g= -go.mongodb.org/mongo-driver v1.11.6 h1:XM7G6PjiGAO5betLF13BIa5TlLUUE3uJ/2Ox3Lz1K+o= -go.mongodb.org/mongo-driver v1.11.6/go.mod h1:G9TgswdsWjX4tmDA5zfs2+6AEPpYJwqblyjsfuh8oXY= +go.mongodb.org/mongo-driver v1.12.1 h1:nLkghSU8fQNaK7oUmDhQFsnrtcoNy7Z6LVFKsEecqgE= +go.mongodb.org/mongo-driver v1.12.1/go.mod h1:/rGBTebI3XYboVmgz+Wv3Bcbl3aD0QF9zl6kDDw18rQ= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -623,15 +634,15 @@ go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opentelemetry.io/otel v1.14.0/go.mod h1:o4buv+dJzx8rohcUeRmWUZhqupFvzWis188WlggnNeU= -go.opentelemetry.io/otel v1.16.0 h1:Z7GVAX/UkAXPKsy94IU+i6thsQS4nb7LviLpnaNeW8s= -go.opentelemetry.io/otel v1.16.0/go.mod h1:vl0h9NUa1D5s1nv3A5vZOYWn8av4K8Ml6JDeHrT/bx4= -go.opentelemetry.io/otel/metric v1.16.0 h1:RbrpwVG1Hfv85LgnZ7+txXioPDoh6EdbZHo26Q3hqOo= -go.opentelemetry.io/otel/metric v1.16.0/go.mod h1:QE47cpOmkwipPiefDwo2wDzwJrlfxxNYodqc4xnGCo4= +go.opentelemetry.io/otel v1.17.0 h1:MW+phZ6WZ5/uk2nd93ANk/6yJ+dVrvNWUjGhnnFU5jM= +go.opentelemetry.io/otel v1.17.0/go.mod h1:I2vmBGtFaODIVMBSTPVDlJSzBDNf93k60E6Ft0nyjo0= 
+go.opentelemetry.io/otel/metric v1.17.0 h1:iG6LGVz5Gh+IuO0jmgvpTB6YVrCGngi8QGm+pMd8Pdc= +go.opentelemetry.io/otel/metric v1.17.0/go.mod h1:h4skoxdZI17AxwITdmdZjjYJQH5nzijUUjm+wtPph5o= go.opentelemetry.io/otel/sdk v1.14.0 h1:PDCppFRDq8A1jL9v6KMI6dYesaq+DFcDZvjsoGvxGzY= go.opentelemetry.io/otel/sdk v1.14.0/go.mod h1:bwIC5TjrNG6QDCHNWvW4HLHtUQ4I+VQDsnjhvyZCALM= go.opentelemetry.io/otel/trace v1.14.0/go.mod h1:8avnQLK+CG77yNLUae4ea2JDQ6iT+gozhnZjy/rw9G8= -go.opentelemetry.io/otel/trace v1.16.0 h1:8JRpaObFoW0pxuVPapkgH8UhHQj+bJW8jJsCZEu5MQs= -go.opentelemetry.io/otel/trace v1.16.0/go.mod h1:Yt9vYq1SdNz3xdjZZK7wcXv1qv2pwLkqr2QVwea0ef0= +go.opentelemetry.io/otel/trace v1.17.0 h1:/SWhSRHmDPOImIAetP1QAeMnZYiQXrTy4fMMYOdSKWQ= +go.opentelemetry.io/otel/trace v1.17.0/go.mod h1:I/4vKTgFclIsXRVucpH25X0mpFSczM7aHeaz0ZBLWjY= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= @@ -655,8 +666,9 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.1.0/go.mod h1:RecgLatLF4+eUMCP1PoPZQb+cVrJcOPbHkTkbkB9sbw= golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= -golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= +golang.org/x/crypto v0.12.0 h1:tFM/ta59kqch6LlvYnPa0yx5a83cL2nHflFhYKvv9Yk= +golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -697,8 +709,9 @@ golang.org/x/mod v0.6.0/go.mod h1:4mET923SAdbXp2ki8ey+zGs1SLqsuM2Y0uvdZR/fUNI= golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk= golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -753,8 +766,9 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.13.0 h1:Nvo8UFsZ8X3BhAC9699Z1j7XQ3rsZnUUm7jfBEk1ueY= -golang.org/x/net v0.13.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= 
+golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14= +golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -781,6 +795,7 @@ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.2.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -852,8 +867,10 @@ golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= +golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -863,8 +880,9 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.10.0 h1:3R7pNqamzBraeqj/Tj8qt1aQ2HpmlC+Cx/qL/7hn4/c= golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o= +golang.org/x/term v0.11.0 h1:F9tnn/DA/Im8nCwm+fX+1/eBwi4qFjRT++MhtVC4ZX0= +golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -874,14 +892,16 @@ golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4= golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc= +golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -952,8 +972,9 @@ golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= golang.org/x/tools v0.9.1/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= -golang.org/x/tools v0.9.3 h1:Gn1I8+64MsuTb/HpH+LmQtNas23LhUVr3rYZ0eKuaMM= golang.org/x/tools v0.9.3/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= +golang.org/x/tools v0.12.0 h1:YW6HUoUmYBpwSgyaGaZq1fHjrBjX1rlpZ54T6mu2kss= +golang.org/x/tools v0.12.0/go.mod h1:Sc0INKfu04TlqNoRA1hgpFZbhYXHPr4V5DzpSBTPqQM= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/metadata.yaml b/metadata.yaml index ca040e02..55eda299 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -6,6 +6,9 @@ apiVersion: clusterctl.cluster.x-k8s.io/v1alpha3 kind: Metadata releaseSeries: + - major: 1 + minor: 3 + contract: v1beta1 - major: 1 minor: 2 contract: v1beta1 diff --git a/pkg/service/collections.go b/pkg/service/collections.go index ff35b4fc..bf9d7cef 100644 --- a/pkg/service/collections.go +++ b/pkg/service/collections.go @@ -159,3 +159,12 @@ func And(filters ...Func) Func { return true } } + +// IDs returns the IDs of all hosts. 
+func (s Hosts) IDs() []string { + res := make([]string, 0, len(s)) + for _, value := range s { + res = append(res, *value.ID) + } + return res +} diff --git a/pkg/service/collections_test.go b/pkg/service/collections_test.go index 3dc8ebc8..7fe4fa0f 100644 --- a/pkg/service/collections_test.go +++ b/pkg/service/collections_test.go @@ -41,6 +41,7 @@ func TestHostCollection(t *testing.T) { g.Expect(hosts.Get(*TowerString("404"))).To(gomega.BeNil()) g.Expect(hosts.Find(sets.Set[string]{}.Insert(*host1.ID)).Contains(*host1.ID)).To(gomega.BeTrue()) g.Expect(hosts.Find(sets.Set[string]{}.Insert(*host1.ID)).Len()).To(gomega.Equal(1)) + g.Expect(hosts.IDs()).To(gomega.ContainElements(*host1.ID, *host2.ID)) }) t.Run("Available", func(t *testing.T) { diff --git a/pkg/service/errors.go b/pkg/service/errors.go index 29dc0602..1592a9d3 100644 --- a/pkg/service/errors.go +++ b/pkg/service/errors.go @@ -40,6 +40,7 @@ const ( PlacementGroupMustError = "PlacementGroupMustFilter" PlacementGroupPriorError = "PlacementGroupPriorFilter" VMDuplicateError = "VM_DUPLICATED_NAME" + GPUAssignFailed = "GPU_ASSIGN_FAILED" ) func IsVMNotFound(err error) bool { @@ -58,6 +59,10 @@ func IsShutDownTimeout(message string) bool { return strings.Contains(message, "JOB_VM_SHUTDOWN_TIMEOUT") } +func IsGPUAssignFailed(message string) bool { + return strings.Contains(message, GPUAssignFailed) +} + func IsTaskNotFound(err error) bool { return strings.Contains(err.Error(), TaskNotFound) } @@ -91,6 +96,15 @@ func FormatCloudInitError(message string) string { return msg } +func ParseGPUAssignFailed(message string) string { + index := strings.LastIndex(message, GPUAssignFailed) + if index == -1 { + return message + } + + return message[index:] +} + func IsMemoryInsufficientError(message string) bool { return strings.Contains(message, MemoryInsufficientError) } diff --git a/pkg/service/mock_services/vm_mock.go b/pkg/service/mock_services/vm_mock.go index 9abbf3cd..7f401d3c 100644 --- a/pkg/service/mock_services/vm_mock.go +++ b/pkg/service/mock_services/vm_mock.go @@ -13,7 +13,6 @@ import ( models "github.com/smartxworks/cloudtower-go-sdk/v2/models" v1beta1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" service "github.com/smartxworks/cluster-api-provider-elf/pkg/service" - v1beta10 "sigs.k8s.io/cluster-api/api/v1beta1" ) // MockVMService is a mock of VMService interface. @@ -39,6 +38,21 @@ func (m *MockVMService) EXPECT() *MockVMServiceMockRecorder { return m.recorder } +// AddGPUDevices mocks base method. +func (m *MockVMService) AddGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AddGPUDevices", id, gpus) + ret0, _ := ret[0].(*models.Task) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// AddGPUDevices indicates an expected call of AddGPUDevices. +func (mr *MockVMServiceMockRecorder) AddGPUDevices(id, gpus interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddGPUDevices", reflect.TypeOf((*MockVMService)(nil).AddGPUDevices), id, gpus) +} + // AddLabelsToVM mocks base method. func (m *MockVMService) AddLabelsToVM(vmID string, labels []string) (*models.Task, error) { m.ctrl.T.Helper() @@ -70,18 +84,18 @@ func (mr *MockVMServiceMockRecorder) AddVMsToPlacementGroup(placementGroup, vmID } // Clone mocks base method. 
-func (m *MockVMService) Clone(elfCluster *v1beta1.ElfCluster, machine *v1beta10.Machine, elfMachine *v1beta1.ElfMachine, bootstrapData, host string) (*models.WithTaskVM, error) { +func (m *MockVMService) Clone(elfCluster *v1beta1.ElfCluster, elfMachine *v1beta1.ElfMachine, bootstrapData, host string, machineGPUDevices []*models.GpuDevice) (*models.WithTaskVM, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Clone", elfCluster, machine, elfMachine, bootstrapData, host) + ret := m.ctrl.Call(m, "Clone", elfCluster, elfMachine, bootstrapData, host, machineGPUDevices) ret0, _ := ret[0].(*models.WithTaskVM) ret1, _ := ret[1].(error) return ret0, ret1 } // Clone indicates an expected call of Clone. -func (mr *MockVMServiceMockRecorder) Clone(elfCluster, machine, elfMachine, bootstrapData, host interface{}) *gomock.Call { +func (mr *MockVMServiceMockRecorder) Clone(elfCluster, elfMachine, bootstrapData, host, machineGPUDevices interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Clone", reflect.TypeOf((*MockVMService)(nil).Clone), elfCluster, machine, elfMachine, bootstrapData, host) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Clone", reflect.TypeOf((*MockVMService)(nil).Clone), elfCluster, elfMachine, bootstrapData, host, machineGPUDevices) } // CreateVMPlacementGroup mocks base method. @@ -158,6 +172,36 @@ func (mr *MockVMServiceMockRecorder) FindByIDs(ids interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FindByIDs", reflect.TypeOf((*MockVMService)(nil).FindByIDs), ids) } +// FindGPUDevicesByHostIDs mocks base method. +func (m *MockVMService) FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FindGPUDevicesByHostIDs", hostIDs) + ret0, _ := ret[0].([]*models.GpuDevice) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// FindGPUDevicesByHostIDs indicates an expected call of FindGPUDevicesByHostIDs. +func (mr *MockVMServiceMockRecorder) FindGPUDevicesByHostIDs(hostIDs interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FindGPUDevicesByHostIDs", reflect.TypeOf((*MockVMService)(nil).FindGPUDevicesByHostIDs), hostIDs) +} + +// FindGPUDevicesByIDs mocks base method. +func (m *MockVMService) FindGPUDevicesByIDs(gpuIDs []string) ([]*models.GpuDevice, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FindGPUDevicesByIDs", gpuIDs) + ret0, _ := ret[0].([]*models.GpuDevice) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// FindGPUDevicesByIDs indicates an expected call of FindGPUDevicesByIDs. +func (mr *MockVMServiceMockRecorder) FindGPUDevicesByIDs(gpuIDs interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FindGPUDevicesByIDs", reflect.TypeOf((*MockVMService)(nil).FindGPUDevicesByIDs), gpuIDs) +} + // FindVMsByName mocks base method. func (m *MockVMService) FindVMsByName(name string) ([]*models.VM, error) { m.ctrl.T.Helper() @@ -368,6 +412,21 @@ func (mr *MockVMServiceMockRecorder) PowerOn(uuid interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PowerOn", reflect.TypeOf((*MockVMService)(nil).PowerOn), uuid) } +// RemoveGPUDevices mocks base method. 
+func (m *MockVMService) RemoveGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemoveGPUDevices", id, gpus) + ret0, _ := ret[0].(*models.Task) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// RemoveGPUDevices indicates an expected call of RemoveGPUDevices. +func (mr *MockVMServiceMockRecorder) RemoveGPUDevices(id, gpus interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveGPUDevices", reflect.TypeOf((*MockVMService)(nil).RemoveGPUDevices), id, gpus) +} + // ShutDown mocks base method. func (m *MockVMService) ShutDown(uuid string) (*models.Task, error) { m.ctrl.T.Helper() diff --git a/pkg/service/util.go b/pkg/service/util.go index 0190deab..f6794562 100644 --- a/pkg/service/util.go +++ b/pkg/service/util.go @@ -189,6 +189,10 @@ func IsPowerOnVMTask(task *models.Task) bool { return strings.Contains(GetTowerString(task.Description), "Start VM") } +func IsUpdateVMTask(task *models.Task) bool { + return strings.Contains(GetTowerString(task.Description), "Edit VM") +} + func IsVMMigrationTask(task *models.Task) bool { return strings.Contains(GetTowerString(task.Description), "performing a live migration") } @@ -196,3 +200,25 @@ func IsVMMigrationTask(task *models.Task) bool { func IsPlacementGroupTask(task *models.Task) bool { return strings.Contains(GetTowerString(task.Description), "VM placement group") // Update VM placement group } + +// GPUCanBeUsedForVM returns whether the virtual machine can use the specified GPU. +func GPUCanBeUsedForVM(gpuDevice *models.GpuDevice, vm string) bool { + if len(gpuDevice.Vms) == 0 || + *gpuDevice.Vms[0].ID == vm || + *gpuDevice.Vms[0].Name == vm { + return true + } + + return false +} + +func FilterOutGPUsCanNotBeUsedForVM(gpuDevices []*models.GpuDevice, vm string) []*models.GpuDevice { + var gpus []*models.GpuDevice + for i := 0; i < len(gpuDevices); i++ { + if GPUCanBeUsedForVM(gpuDevices[i], vm) { + gpus = append(gpus, gpuDevices[i]) + } + } + + return gpus +} diff --git a/pkg/service/util_test.go b/pkg/service/util_test.go index 28253a6b..98e167e8 100644 --- a/pkg/service/util_test.go +++ b/pkg/service/util_test.go @@ -100,3 +100,27 @@ func TestIsAvailableHost(t *testing.T) { g.Expect(message).To(gomega.ContainSubstring("3")) }) } + +func TestGPUCanBeUsedForVM(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + t.Run("should return false when GPU can not be used for VM", func(t *testing.T) { + g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("id2"), Name: TowerString("vm2")}, {ID: TowerString("id"), Name: TowerString("vm")}}}, "vm")).To(gomega.BeFalse()) + }) + + t.Run("should return true when GPU can be used for VM", func(t *testing.T) { + g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{}, "vm")).To(gomega.BeTrue()) + g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("vm")}}}, "vm")).To(gomega.BeTrue()) + g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("id"), Name: TowerString("vm")}}}, "vm")).To(gomega.BeTrue()) + }) +} + +func TestFilterOutGPUsCanNotBeUsedForVM(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + t.Run("should filter GPUs", func(t *testing.T) { + g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{}, "vm")).To(gomega.BeEmpty()) + g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{{Vms: []*models.NestedVM{{ID: TowerString("id2"), Name: TowerString("vm2")}}}},
"vm")).To(gomega.BeEmpty()) + g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{{Vms: []*models.NestedVM{{ID: TowerString("id"), Name: TowerString("vm")}}}}, "vm")).To(gomega.HaveLen(1)) + }) +} diff --git a/pkg/service/vm.go b/pkg/service/vm.go index 93bb0003..b4dc91fe 100644 --- a/pkg/service/vm.go +++ b/pkg/service/vm.go @@ -24,6 +24,7 @@ import ( "github.com/pkg/errors" clientcluster "github.com/smartxworks/cloudtower-go-sdk/v2/client/cluster" clientvmtemplate "github.com/smartxworks/cloudtower-go-sdk/v2/client/content_library_vm_template" + clientgpu "github.com/smartxworks/cloudtower-go-sdk/v2/client/gpu_device" clienthost "github.com/smartxworks/cloudtower-go-sdk/v2/client/host" clientlabel "github.com/smartxworks/cloudtower-go-sdk/v2/client/label" clienttask "github.com/smartxworks/cloudtower-go-sdk/v2/client/task" @@ -33,7 +34,6 @@ import ( clientvmplacementgroup "github.com/smartxworks/cloudtower-go-sdk/v2/client/vm_placement_group" "github.com/smartxworks/cloudtower-go-sdk/v2/models" "k8s.io/apimachinery/pkg/util/wait" - clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" "github.com/smartxworks/cluster-api-provider-elf/pkg/config" @@ -41,16 +41,16 @@ import ( ) type VMService interface { - Clone(elfCluster *infrav1.ElfCluster, - machine *clusterv1.Machine, - elfMachine *infrav1.ElfMachine, - bootstrapData, host string) (*models.WithTaskVM, error) + Clone(elfCluster *infrav1.ElfCluster, elfMachine *infrav1.ElfMachine, bootstrapData, + host string, machineGPUDevices []*models.GpuDevice) (*models.WithTaskVM, error) UpdateVM(vm *models.VM, elfMachine *infrav1.ElfMachine) (*models.WithTaskVM, error) Migrate(vmID, hostID string) (*models.WithTaskVM, error) Delete(uuid string) (*models.Task, error) PowerOff(uuid string) (*models.Task, error) PowerOn(uuid string) (*models.Task, error) ShutDown(uuid string) (*models.Task, error) + RemoveGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) + AddGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) Get(id string) (*models.VM, error) GetByName(name string) (*models.VM, error) FindByIDs(ids []string) ([]*models.VM, error) @@ -70,6 +70,8 @@ type VMService interface { GetVMPlacementGroup(name string) (*models.VMPlacementGroup, error) AddVMsToPlacementGroup(placementGroup *models.VMPlacementGroup, vmIDs []string) (*models.Task, error) DeleteVMPlacementGroupsByName(ctx goctx.Context, placementGroupName string) error + FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error) + FindGPUDevicesByIDs(gpuIDs []string) ([]*models.GpuDevice, error) } type NewVMServiceFunc func(ctx goctx.Context, auth infrav1.Tower, logger logr.Logger) (VMService, error) @@ -113,10 +115,8 @@ func (svr *TowerVMService) UpdateVM(vm *models.VM, elfMachine *infrav1.ElfMachin // Clone kicks off a clone operation on Elf to create a new virtual machine using VM template. 
func (svr *TowerVMService) Clone( - elfCluster *infrav1.ElfCluster, - machine *clusterv1.Machine, - elfMachine *infrav1.ElfMachine, - bootstrapData, host string) (*models.WithTaskVM, error) { + elfCluster *infrav1.ElfCluster, elfMachine *infrav1.ElfMachine, bootstrapData, + host string, machineGPUDevices []*models.GpuDevice) (*models.WithTaskVM, error) { cluster, err := svr.GetCluster(elfCluster.Spec.Cluster) if err != nil { return nil, err @@ -131,6 +131,20 @@ func (svr *TowerVMService) Clone( cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket) cpuSockets := TowerCPUSockets(*vCPU, *cpuCores) + gpuDevices := make([]*models.VMGpuOperationParams, len(machineGPUDevices)) + for i := 0; i < len(machineGPUDevices); i++ { + gpuDevices[i] = &models.VMGpuOperationParams{ + GpuID: machineGPUDevices[i].ID, + Amount: TowerInt32(1), + } + } + + ha := TowerBool(elfMachine.Spec.HA) + // HA cannot be enabled on a virtual machine with GPU/vGPU devices. + if len(gpuDevices) > 0 { + ha = TowerBool(false) + } + var mountDisks []*models.MountNewCreateDisksParams if elfMachine.Spec.DiskGiB > 0 { storagePolicy := models.VMVolumeElfStoragePolicyTypeREPLICA2THINPROVISION @@ -233,8 +247,9 @@ func (svr *TowerVMService) Clone( CPUCores: cpuCores, CPUSockets: cpuSockets, Memory: TowerMemory(elfMachine.Spec.MemoryMiB), + GpuDevices: gpuDevices, Status: models.NewVMStatus(models.VMStatusSTOPPED), - Ha: TowerBool(elfMachine.Spec.HA), + Ha: ha, IsFullCopy: TowerBool(isFullCopy), TemplateID: template.ID, GuestOsType: models.NewVMGuestsOperationSystem(models.VMGuestsOperationSystem(elfMachine.Spec.OSType)), @@ -365,6 +380,48 @@ func (svr *TowerVMService) ShutDown(id string) (*models.Task, error) { return &models.Task{ID: shutDownVMResp.Payload[0].TaskID}, nil } +func (svr *TowerVMService) RemoveGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) { + removeVMGpuDeviceParams := clientvm.NewRemoveVMGpuDeviceParams() + removeVMGpuDeviceParams.RequestBody = &models.VMRemoveGpuDeviceParams{ + Data: gpus, + Where: &models.VMWhereInput{ + OR: []*models.VMWhereInput{{LocalID: TowerString(id)}, {ID: TowerString(id)}}, + }, + } + + removeVMGpuDeviceResp, err := svr.Session.VM.RemoveVMGpuDevice(removeVMGpuDeviceParams) + if err != nil { + return nil, err + } + + if len(removeVMGpuDeviceResp.Payload) == 0 { + return nil, errors.New(VMNotFound) + } + + return &models.Task{ID: removeVMGpuDeviceResp.Payload[0].TaskID}, nil +} + +func (svr *TowerVMService) AddGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) { + addVMGpuDeviceParams := clientvm.NewAddVMGpuDeviceParams() + addVMGpuDeviceParams.RequestBody = &models.VMAddGpuDeviceParams{ + Data: gpus, + Where: &models.VMWhereInput{ + OR: []*models.VMWhereInput{{LocalID: TowerString(id)}, {ID: TowerString(id)}}, + }, + } + + addVMGpuDeviceResp, err := svr.Session.VM.AddVMGpuDevice(addVMGpuDeviceParams) + if err != nil { + return nil, err + } + + if len(addVMGpuDeviceResp.Payload) == 0 { + return nil, errors.New(VMNotFound) + } + + return &models.Task{ID: addVMGpuDeviceResp.Payload[0].TaskID}, nil +} + // Get searches for a virtual machine.
func (svr *TowerVMService) Get(id string) (*models.VM, error) { getVmsParams := clientvm.NewGetVmsParams() @@ -821,3 +878,47 @@ func (svr *TowerVMService) DeleteVMPlacementGroupsByName(ctx goctx.Context, plac return nil } + +func (svr *TowerVMService) FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error) { + if len(hostIDs) == 0 { + return nil, nil + } + + getGpuDevicesParams := clientgpu.NewGetGpuDevicesParams() + getGpuDevicesParams.RequestBody = &models.GetGpuDevicesRequestBody{ + Where: &models.GpuDeviceWhereInput{ + UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsagePASSTHROUGH), + Host: &models.HostWhereInput{ + IDIn: hostIDs, + }, + }, + } + + getGpuDevicesResp, err := svr.Session.GpuDevice.GetGpuDevices(getGpuDevicesParams) + if err != nil { + return nil, err + } + + return getGpuDevicesResp.Payload, nil +} + +func (svr *TowerVMService) FindGPUDevicesByIDs(gpuIDs []string) ([]*models.GpuDevice, error) { + if len(gpuIDs) == 0 { + return nil, nil + } + + getGpuDevicesParams := clientgpu.NewGetGpuDevicesParams() + getGpuDevicesParams.RequestBody = &models.GetGpuDevicesRequestBody{ + Where: &models.GpuDeviceWhereInput{ + UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsagePASSTHROUGH), + IDIn: gpuIDs, + }, + } + + getGpuDevicesResp, err := svr.Session.GpuDevice.GetGpuDevices(getGpuDevicesParams) + if err != nil { + return nil, err + } + + return getGpuDevicesResp.Payload, nil +} diff --git a/test/e2e/data/cape/metadata.yaml b/test/e2e/data/cape/metadata.yaml index ca040e02..55eda299 100644 --- a/test/e2e/data/cape/metadata.yaml +++ b/test/e2e/data/cape/metadata.yaml @@ -6,6 +6,9 @@ apiVersion: clusterctl.cluster.x-k8s.io/v1alpha3 kind: Metadata releaseSeries: + - major: 1 + minor: 3 + contract: v1beta1 - major: 1 minor: 2 contract: v1beta1 diff --git a/test/fake/tower.go b/test/fake/tower.go index b2801762..3b6329f5 100644 --- a/test/fake/tower.go +++ b/test/fake/tower.go @@ -150,3 +150,12 @@ func NewWithTaskVMPlacementGroup(placementGroup *models.VMPlacementGroup, task * TaskID: task.ID, } } + +func NewTowerGPU() *models.GpuDevice { + return &models.GpuDevice{ + ID: pointer.String(ID()), + LocalID: pointer.String(UUID()), + Name: pointer.String(ID()), + Model: pointer.String("A16"), + } +}
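Reviewer note: a minimal sketch of how the new pieces are intended to compose when scheduling a GPU machine. This is illustrative only and not part of the diff: selectAndLockGPUsOnHost is a hypothetical helper name, and lockGPUDevicesForVM is the unexported lock helper exercised by the "Lock GPU devices for VM" test above, so the sketch assumes it runs in the same package as that helper.

package controllers

import (
	"github.com/smartxworks/cloudtower-go-sdk/v2/models"

	"github.com/smartxworks/cluster-api-provider-elf/pkg/service"
)

// selectAndLockGPUsOnHost (hypothetical) returns `count` passthrough GPUs on
// the given host that the VM may use, locking them so that concurrent
// reconciles do not hand the same devices to another machine.
func selectAndLockGPUsOnHost(vmSvc service.VMService, clusterID, vmName, hostID string, count int) ([]*models.GpuDevice, bool, error) {
	// FindGPUDevicesByHostIDs only returns devices in PASSTHROUGH usage mode.
	gpus, err := vmSvc.FindGPUDevicesByHostIDs([]string{hostID})
	if err != nil {
		return nil, false, err
	}

	// Keep GPUs that are free or already attached to this VM.
	available := service.FilterOutGPUsCanNotBeUsedForVM(gpus, vmName)
	if len(available) < count {
		return nil, false, nil
	}

	selected := available[:count]
	gpuIDs := make([]string, 0, count)
	for i := range selected {
		gpuIDs = append(gpuIDs, *selected[i].ID)
	}

	// The in-memory lock expires after gpuLockTimeout, as covered by the
	// "Lock GPU devices for VM" test above; a false return means another
	// reconcile already holds some of these devices.
	if !lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs) {
		return nil, false, nil
	}

	return selected, true, nil
}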
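Similarly, a small usage sketch for the new GPU error helpers in pkg/service/errors.go; the message text here is an assumption, only the embedded GPU_ASSIGN_FAILED code matters. ParseGPUAssignFailed trims everything in front of the code so the reported condition message stays compact.

package main

import (
	"fmt"

	"github.com/smartxworks/cluster-api-provider-elf/pkg/service"
)

func main() {
	// Hypothetical task error message from Tower.
	msg := "POST /v2/api failed: GPU_ASSIGN_FAILED: gpu occupied by another vm"
	if service.IsGPUAssignFailed(msg) {
		// Prints "GPU_ASSIGN_FAILED: gpu occupied by another vm".
		fmt.Println(service.ParseGPUAssignFailed(msg))
	}
}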