Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for GPU #146

Merged
merged 4 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions api/v1beta1/conditions_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,16 @@ const (
// are automatically re-tried by the controller.
UpdatingFailedReason = "UpdatingFailed"

// DetachingGPUFailedReason (Severity=Warning) documents an ElfMachine controller detecting
// an error while detaching GPU devices; these kinds of errors are usually transient, and the
// failed operation is automatically retried by the controller.
DetachingGPUFailedReason = "DetachingGPUFailed"

// AttachingGPUFailedReason (Severity=Warning) documents an ElfMachine controller detecting
// an error while attaching GPU devices; these kinds of errors are usually transient, and the
// failed operation is automatically retried by the controller.
AttachingGPUFailedReason = "AttachingGPUFailed"

// TaskFailureReason (Severity=Warning) documents an ElfMachine task failure; the reconcile loop will automatically
// retry the operation, but user intervention might be required to fix the problem.
TaskFailureReason = "TaskFailure"
Expand All @@ -95,6 +105,10 @@ const (
// WaitingForAvailableHostRequiredByPlacementGroupReason (Severity=Info) documents an ElfMachine
// waiting for an available host required by placement group to create VM.
WaitingForAvailableHostRequiredByPlacementGroupReason = "WaitingForAvailableHostRequiredByPlacementGroup"

// WaitingForAvailableHostWithEnoughGPUsReason (Severity=Info) documents an ElfMachine
// waiting for an available host with enough GPUs to create VM.
WaitingForAvailableHostWithEnoughGPUsReason = "WaitingForAvailableHostWithEnoughGPUs"
)

// Conditions and Reasons related to make connections to a Tower. Can currently be used by ElfCluster and ElfMachine
Expand Down
17 changes: 17 additions & 0 deletions api/v1beta1/elfmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ type ElfMachineSpec struct {
// +optional
DiskGiB int32 `json:"diskGiB,omitempty"`

// GPUDevices is the list of GPUs used by the virtual machine.
// +optional
GPUDevices []GPUPassthroughDeviceSpec `json:"gpuDevices,omitempty"`

// VGPUDevices is the list of vGPUs used by the virtual machine.
// +optional
VGPUDevices []VGPUDeviceSpec `json:"vgpuDevices,omitempty"`

// +optional
HA bool `json:"ha,omitempty"`

Expand Down Expand Up @@ -106,6 +114,11 @@ type ElfMachineStatus struct {
// +optional
Network []NetworkStatus `json:"network,omitempty"`

// GPUDevices returns the GPU devices status for each of the machine's configured
// GPU devices.
// +optional
GPUDevices []GPUStatus `json:"gpuDevices,omitempty"`

// FailureReason will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a succinct value suitable
// for machine interpretation.
Expand Down Expand Up @@ -300,6 +313,10 @@ func (m *ElfMachine) GetVMDisconnectionTimestamp() *metav1.Time {
return nil
}

// RequiresGPUDevices reports whether the ElfMachine spec lists at least one
// GPU passthrough device or at least one vGPU device.
func (m *ElfMachine) RequiresGPUDevices() bool {
	if len(m.Spec.GPUDevices) > 0 {
		return true
	}

	return len(m.Spec.VGPUDevices) > 0
}

//+kubebuilder:object:root=true

// ElfMachineList contains a list of ElfMachine.
Expand Down
31 changes: 31 additions & 0 deletions api/v1beta1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,37 @@ type NetworkDeviceRouteSpec struct {
Network string `json:"network,omitempty"`
}

// GPUPassthroughDeviceSpec describes the physical GPUs of a single model
// that are used by a virtual machine.
type GPUPassthroughDeviceSpec struct {
	// Model is the model name of the physical GPU, for example 'A16'.
	Model string `json:"model,omitempty"`

	// Count is how many GPUs of this model are used. Defaults to 1.
	// +optional
	// +kubebuilder:default=1
	// +kubebuilder:validation:Minimum=1
	Count int32 `json:"count,omitempty"`
}

// VGPUDeviceSpec describes the virtual GPUs (vGPUs) of a single type that
// are used by a virtual machine.
type VGPUDeviceSpec struct {
	// Type is the type name of the virtual GPU, for example 'NVIDIA A16-16A'.
	// +kubebuilder:validation:Required
	Type string `json:"type,omitempty"`

	// Count is how many vGPUs of this type are used. Defaults to 1.
	// +optional
	// +kubebuilder:default=1
	// +kubebuilder:validation:Minimum=1
	Count int32 `json:"count,omitempty"`
}

// GPUStatus provides information about one of a VM's GPU devices.
type GPUStatus struct {
	// GPUID is the identifier of the GPU device.
	GPUID string `json:"gpuId,omitempty"`

	// Name is the name of the GPU device.
	Name string `json:"name,omitempty"`
}

//+kubebuilder:object:generate=false

// PatchStringValue is for patching resources.
Expand Down
60 changes: 60 additions & 0 deletions api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.4
controller-gen.kubebuilder.io/version: v0.12.0
name: elfclusters.infrastructure.cluster.x-k8s.io
spec:
group: infrastructure.cluster.x-k8s.io
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.4
controller-gen.kubebuilder.io/version: v0.12.0
name: elfmachines.infrastructure.cluster.x-k8s.io
spec:
group: infrastructure.cluster.x-k8s.io
Expand Down Expand Up @@ -76,6 +76,24 @@ spec:
this infrastructure provider, the name is equivalent to the name
of the ElfDeploymentZone.
type: string
gpuDevices:
description: GPUDevices is the list of GPUs used by the virtual machine.
items:
description: GPUPassthroughDeviceSpec defines virtual machine's
GPU configuration
properties:
count:
default: 1
description: Count is the number of GPU. Defaults to 1.
format: int32
minimum: 1
type: integer
model:
description: Model is the model name of a physical GPU, e.g.
'A16'.
type: string
type: object
type: array
ha:
type: boolean
host:
Expand Down Expand Up @@ -183,6 +201,24 @@ spec:
description: Template is the name or ID of the template used to clone
new machines.
type: string
vgpuDevices:
description: VGPUDevices is the list of vGPUs used by the virtual
machine.
items:
description: VGPUDeviceSpec defines virtual machine's VGPU configuration
properties:
count:
default: 1
description: Count is the number of vGPU. Defaults to 1.
format: int32
minimum: 1
type: integer
type:
description: Type is the type name of a virtual GPU, e.g. 'NVIDIA
A16-16A'.
type: string
type: object
type: array
required:
- template
type: object
Expand Down Expand Up @@ -282,6 +318,19 @@ spec:
during the reconciliation of Machines can be added as events to
the Machine object and/or logged in the controller's output."
type: string
gpuDevices:
description: GPUDevices returns the GPU devices status for each of
the machine's configured GPU devices.
items:
description: GPUStatus provides information about one of a VM's
GPU device.
properties:
gpuId:
type: string
name:
type: string
type: object
type: array
hostServerName:
description: HostServerName is the name of host server where the virtual
machine runs on. This value is set automatically at runtime and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.4
controller-gen.kubebuilder.io/version: v0.12.0
name: elfmachinetemplates.infrastructure.cluster.x-k8s.io
spec:
group: infrastructure.cluster.x-k8s.io
Expand Down Expand Up @@ -56,6 +56,26 @@ spec:
API. For this infrastructure provider, the name is equivalent
to the name of the ElfDeploymentZone.
type: string
gpuDevices:
description: GPUDevices is the list of GPUs used by the virtual
machine.
items:
description: GPUPassthroughDeviceSpec defines virtual machine's
GPU configuration
properties:
count:
default: 1
description: Count is the number of GPU. Defaults to
1.
format: int32
minimum: 1
type: integer
model:
description: Model is the model name of a physical GPU,
e.g. 'A16'.
type: string
type: object
type: array
ha:
type: boolean
host:
Expand Down Expand Up @@ -165,6 +185,26 @@ spec:
description: Template is the name or ID of the template used
to clone new machines.
type: string
vgpuDevices:
description: VGPUDevices is the list of vGPUs used by the
virtual machine.
items:
description: VGPUDeviceSpec defines virtual machine's VGPU
configuration
properties:
count:
default: 1
description: Count is the number of vGPU. Defaults to
1.
format: int32
minimum: 1
type: integer
type:
description: Type is the type name of a virtual GPU,
e.g. 'NVIDIA A16-16A'.
type: string
type: object
type: array
required:
- template
type: object
Expand Down
Loading