smartxworks · haijianyang · Sep 27, 2023 · Sep 25, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go
@@ -76,6 +76,16 @@ const (
 	// are automatically re-tried by the controller.
 	UpdatingFailedReason = "UpdatingFailed"
 
+	// DetachingGPUFailedReason (Severity=Warning) documents an ElfMachine controller detecting
+	// an error while detaching GPU devices; these kinds of errors are usually transient and the
+	// failed operation is automatically retried by the controller.
+	DetachingGPUFailedReason = "DetachingGPUFailed"
+
+	// AttachingGPUFailedReason (Severity=Warning) documents an ElfMachine controller detecting
+	// an error while attaching GPU devices; these kinds of errors are usually transient and the
+	// failed operation is automatically retried by the controller.
+	AttachingGPUFailedReason = "AttachingGPUFailed"
+
 	// TaskFailureReason (Severity=Warning) documents an ElfMachine task failure; the reconcile look will automatically
 	// retry the operation, but a user intervention might be required to fix the problem.
 	TaskFailureReason = "TaskFailure"
@@ -95,6 +105,10 @@ const (
 	// WaitingForAvailableHostRequiredByPlacementGroupReason (Severity=Info) documents an ElfMachine
 	// waiting for an available host required by placement group to create VM.
 	WaitingForAvailableHostRequiredByPlacementGroupReason = "WaitingForAvailableHostRequiredByPlacementGroup"
+
+	// WaitingForAvailableHostWithEnoughGPUsReason (Severity=Info) documents an ElfMachine
+	// waiting for an available host with enough GPUs to create VM.
+	WaitingForAvailableHostWithEnoughGPUsReason = "WaitingForAvailableHostWithEnoughGPUs"
 )
 
 // Conditions and Reasons related to make connections to a Tower. Can currently be used by ElfCluster and ElfMachine

diff --git a/api/v1beta1/elfmachine_types.go b/api/v1beta1/elfmachine_types.go
@@ -75,6 +75,14 @@ type ElfMachineSpec struct {
 	// +optional
 	DiskGiB int32 `json:"diskGiB,omitempty"`
 
+	// GPUDevices is the list of GPUs used by the virtual machine.
+	// +optional
+	GPUDevices []GPUPassthroughDeviceSpec `json:"gpuDevices,omitempty"`
+
+	// VGPUDevices is the list of vGPUs used by the virtual machine.
+	// +optional
+	VGPUDevices []VGPUDeviceSpec `json:"vgpuDevices,omitempty"`
+
 	// +optional
 	HA bool `json:"ha,omitempty"`
 
@@ -106,6 +114,11 @@ type ElfMachineStatus struct {
 	// +optional
 	Network []NetworkStatus `json:"network,omitempty"`
 
+	// GPUDevices returns the GPU devices status for each of the machine's configured
+	// GPU devices.
+	// +optional
+	GPUDevices []GPUStatus `json:"gpuDevices,omitempty"`
+
 	// FailureReason will be set in the event that there is a terminal problem
 	// reconciling the Machine and will contain a succinct value suitable
 	// for machine interpretation.
@@ -300,6 +313,10 @@ func (m *ElfMachine) GetVMDisconnectionTimestamp() *metav1.Time {
 	return nil
 }
 
+// RequiresGPUDevices reports whether the machine's spec lists at least one
+// GPU device or vGPU device.
+func (m *ElfMachine) RequiresGPUDevices() bool {
+	if len(m.Spec.GPUDevices) > 0 {
+		return true
+	}
+	return len(m.Spec.VGPUDevices) > 0
+}
+
 //+kubebuilder:object:root=true
 
 // ElfMachineList contains a list of ElfMachine.

diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go
@@ -146,6 +146,37 @@ type NetworkDeviceRouteSpec struct {
 	Network string `json:"network,omitempty"`
 }
 
+// GPUPassthroughDeviceSpec defines the GPU configuration of a virtual machine.
+type GPUPassthroughDeviceSpec struct {
+	// Model is the model name of a physical GPU, e.g. 'A16'.
+	Model string `json:"model,omitempty"`
+
+	// Count is the number of GPUs. Defaults to 1.
+	// +optional
+	// +kubebuilder:default=1
+	// +kubebuilder:validation:Minimum=1
+	Count int32 `json:"count,omitempty"`
+}
+
+// VGPUDeviceSpec defines the vGPU configuration of a virtual machine.
+type VGPUDeviceSpec struct {
+	// Type is the type name of a virtual GPU, e.g. 'NVIDIA A16-16A'.
+	// +kubebuilder:validation:Required
+	Type string `json:"type,omitempty"`
+
+	// Count is the number of vGPUs. Defaults to 1.
+	// +optional
+	// +kubebuilder:default=1
+	// +kubebuilder:validation:Minimum=1
+	Count int32 `json:"count,omitempty"`
+}
+
+// GPUStatus provides information about one of a VM's GPU devices.
+type GPUStatus struct {
+	GPUID string `json:"gpuId,omitempty"`
+	Name  string `json:"name,omitempty"`
+}
+
 //+kubebuilder:object:generate=false
 
 // PatchStringValue is for patching resources.

diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.11.4
+    controller-gen.kubebuilder.io/version: v0.12.0
   name: elfclusters.infrastructure.cluster.x-k8s.io
 spec:
   group: infrastructure.cluster.x-k8s.io

diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.11.4
+    controller-gen.kubebuilder.io/version: v0.12.0
   name: elfmachines.infrastructure.cluster.x-k8s.io
 spec:
   group: infrastructure.cluster.x-k8s.io
@@ -76,6 +76,24 @@ spec:
                   this infrastructure provider, the name is equivalent to the name
                   of the ElfDeploymentZone.
                 type: string
+              gpuDevices:
+                description: GPUDevices is the list of GPUs used by the virtual machine.
+                items:
+                  description: GPUPassthroughDeviceSpec defines virtual machine's
+                    GPU configuration
+                  properties:
+                    count:
+                      default: 1
+                      description: Count is the number of GPU. Defaults to 1.
+                      format: int32
+                      minimum: 1
+                      type: integer
+                    model:
+                      description: Model is the model name of a physical GPU, e.g.
+                        'A16'.
+                      type: string
+                  type: object
+                type: array
               ha:
                 type: boolean
               host:
@@ -183,6 +201,24 @@ spec:
                 description: Template is the name or ID of the template used to clone
                   new machines.
                 type: string
+              vgpuDevices:
+                description: VGPUDevices is the list of vGPUs used by the virtual
+                  machine.
+                items:
+                  description: VGPUDeviceSpec defines virtual machine's VGPU configuration
+                  properties:
+                    count:
+                      default: 1
+                      description: Count is the number of vGPU. Defaults to 1.
+                      format: int32
+                      minimum: 1
+                      type: integer
+                    type:
+                      description: Type is the type name of a virtual GPU, e.g. 'NVIDIA
+                        A16-16A'.
+                      type: string
+                  type: object
+                type: array
             required:
             - template
             type: object
@@ -282,6 +318,19 @@ spec:
                   during the reconciliation of Machines can be added as events to
                   the Machine object and/or logged in the controller's output."
                 type: string
+              gpuDevices:
+                description: GPUDevices returns the GPU devices status for each of
+                  the machine's configured GPU devices.
+                items:
+                  description: GPUStatus provides information about one of a VM's
+                    GPU device.
+                  properties:
+                    gpuId:
+                      type: string
+                    name:
+                      type: string
+                  type: object
+                type: array
               hostServerName:
                 description: HostServerName is the name of host server where the virtual
                   machine runs on. This value is set automatically at runtime and

diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.11.4
+    controller-gen.kubebuilder.io/version: v0.12.0
   name: elfmachinetemplates.infrastructure.cluster.x-k8s.io
 spec:
   group: infrastructure.cluster.x-k8s.io
@@ -56,6 +56,26 @@ spec:
                           API. For this infrastructure provider, the name is equivalent
                           to the name of the ElfDeploymentZone.
                         type: string
+                      gpuDevices:
+                        description: GPUDevices is the list of GPUs used by the virtual
+                          machine.
+                        items:
+                          description: GPUPassthroughDeviceSpec defines virtual machine's
+                            GPU configuration
+                          properties:
+                            count:
+                              default: 1
+                              description: Count is the number of GPU. Defaults to
+                                1.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            model:
+                              description: Model is the model name of a physical GPU,
+                                e.g. 'A16'.
+                              type: string
+                          type: object
+                        type: array
                       ha:
                         type: boolean
                       host:
@@ -165,6 +185,26 @@ spec:
                         description: Template is the name or ID of the template used
                           to clone new machines.
                         type: string
+                      vgpuDevices:
+                        description: VGPUDevices is the list of vGPUs used by the
+                          virtual machine.
+                        items:
+                          description: VGPUDeviceSpec defines virtual machine's VGPU
+                            configuration
+                          properties:
+                            count:
+                              default: 1
+                              description: Count is the number of vGPU. Defaults to
+                                1.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            type:
+                              description: Type is the type name of a virtual GPU,
+                                e.g. 'NVIDIA A16-16A'.
+                              type: string
+                          type: object
+                        type: array
                     required:
                     - template
                     type: object