diff --git a/Makefile b/Makefile index 9c116ec6..edeacb13 100644 --- a/Makefile +++ b/Makefile @@ -155,7 +155,7 @@ kustomize: ## Download kustomize locally if necessary. CONTROLLER_GEN = $(shell pwd)/bin/controller-gen controller-gen: ## Download controller-gen locally if necessary. - $(call go-get-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen@v0.13.0) + $(call go-get-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen@v0.15.0) GINKGO := $(shell pwd)/bin/ginkgo ginkgo: ## Download ginkgo locally if necessary. diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go index 2a52625c..cabaa436 100644 --- a/api/v1beta1/conditions_consts.go +++ b/api/v1beta1/conditions_consts.go @@ -144,6 +144,23 @@ const ( // detecting an error while adding new disk capacity to root directory; those kind of errors are // usually transient and failed updating are automatically re-tried by the controller. ExpandingRootPartitionFailedReason = "ExpandingRootPartitionFailed" + + // ExpandingVMResourcesReason documents (Severity=Info) an ElfMachine currently executing the + // expand resources (CPU/memory) operation. + ExpandingVMResourcesReason = "ExpandingVMResources" + + // ExpandingVMResourcesFailedReason (Severity=Warning) documents an ElfMachine controller detecting + // an error while expanding resources (CPU/memory); those kinds of errors are usually transient and + // failed updates are automatically re-tried by the controller. + ExpandingVMResourcesFailedReason = "ExpandingVMResourcesFailed" + + // RestartingKubeletReason documents (Severity=Info) an ElfMachine currently executing the restart kubelet operation. + RestartingKubeletReason = "RestartingKubelet" + + // RestartingKubeletFailedReason (Severity=Warning) documents an ElfMachine controller detecting + // an error while restarting kubelet; those kinds of errors are usually transient and failed restarts + // are automatically re-tried by the controller. + RestartingKubeletFailedReason = "RestartingKubeletFailed" ) // Conditions and Reasons related to make connections to a Tower. Can currently be used by ElfCluster and ElfMachine diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go index 6c5154cb..efd1d549 100644 --- a/api/v1beta1/types.go +++ b/api/v1beta1/types.go @@ -18,6 +18,7 @@ package v1beta1 import ( corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" ) // CloneMode is the type of clone operation used to clone a VM from a template. @@ -199,6 +200,10 @@ type GPUStatus struct { // ResourcesStatus records the resources allocated to the virtual machine. type ResourcesStatus struct { Disk int32 `json:"disk,omitempty"` + // CPUCores is the total number of CPU cores allocated for the virtual machine. + CPUCores int32 `json:"cpu,omitempty"` + // Memory is the total amount of memory in MiB allocated for the virtual machine.
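	// A minimal sketch (not part of this patch) of how this resource.Quantity
	// field behaves, using only the standard k8s.io/apimachinery/pkg/api/resource
	// API; the value is stored with byte precision, so a MiB-granular quantity
	// round-trips cleanly:
	//
	//	q := resource.NewQuantity(4096*1024*1024, resource.BinarySI) // 4096 MiB
	//	q.String() // "4Gi"
	//	q.Value()  // 4294967296 (bytes)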
+ Memory resource.Quantity `json:"memory,omitempty"` } //+kubebuilder:object:generate=false diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 7e292e8d..ff9c16f3 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -247,7 +247,7 @@ func (in *ElfMachineStatus) DeepCopyInto(out *ElfMachineStatus) { *out = make([]GPUStatus, len(*in)) copy(*out, *in) } - out.Resources = in.Resources + in.Resources.DeepCopyInto(&out.Resources) if in.FailureReason != nil { in, out := &in.FailureReason, &out.FailureReason *out = new(errors.MachineStatusError) @@ -487,6 +487,7 @@ func (in *NetworkStatus) DeepCopy() *NetworkStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ResourcesStatus) DeepCopyInto(out *ResourcesStatus) { *out = *in + out.Memory = in.Memory.DeepCopy() } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourcesStatus. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml index 131e3e51..5877fe4d 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfclusters.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.13.0 + controller-gen.kubebuilder.io/version: v0.15.0 name: elfclusters.infrastructure.cluster.x-k8s.io spec: group: infrastructure.cluster.x-k8s.io @@ -38,14 +38,19 @@ spec: description: ElfCluster is the Schema for the elfclusters API. properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -95,9 +100,9 @@ spec: type: string type: object vmGracefulShutdown: - description: VMGracefulShutdown indicates the VMs in this ElfCluster - should shutdown gracefully when deleting the VMs. Default to false - because sometimes the OS stuck when shutting down gracefully. + description: |- + VMGracefulShutdown indicates the VMs in this ElfCluster should shutdown gracefully when deleting the VMs. 
+ Default to false because sometimes the OS stuck when shutting down gracefully. type: boolean type: object status: @@ -110,37 +115,37 @@ spec: operational state. properties: lastTransitionTime: - description: Last time the condition transitioned from one status - to another. This should be when the underlying condition changed. - If that is not known, then using the time when the API field - changed is acceptable. + description: |- + Last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when + the API field changed is acceptable. format: date-time type: string message: - description: A human readable message indicating details about - the transition. This field may be empty. + description: |- + A human readable message indicating details about the transition. + This field may be empty. type: string reason: - description: The reason for the condition's last transition - in CamelCase. The specific API may choose whether or not this - field is considered a guaranteed API. This field may not be - empty. + description: |- + The reason for the condition's last transition in CamelCase. + The specific API may choose whether or not this field is considered a guaranteed API. + This field may not be empty. type: string severity: - description: Severity provides an explicit classification of - Reason code, so the users or machines can immediately understand - the current situation and act accordingly. The Severity field - MUST be set only when Status=False. + description: |- + Severity provides an explicit classification of Reason code, so the users or machines can immediately + understand the current situation and act accordingly. + The Severity field MUST be set only when Status=False. type: string status: description: Status of the condition, one of True, False, Unknown. type: string type: - description: Type of condition in CamelCase or in foo.example.com/CamelCase. - Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. + description: |- + Type of condition in CamelCase or in foo.example.com/CamelCase. + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions + can be useful (see .node.status.conditions), the ability to deconflict is important. type: string required: - lastTransitionTime diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml index 44368eba..1f9ab988 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.13.0 + controller-gen.kubebuilder.io/version: v0.15.0 name: elfmachines.infrastructure.cluster.x-k8s.io spec: group: infrastructure.cluster.x-k8s.io @@ -49,14 +49,19 @@ spec: description: ElfMachine is the Schema for the elfmachines API. properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -71,10 +76,9 @@ spec: format: int32 type: integer failureDomain: - description: FailureDomain is the failure domain unique identifier - this Machine should be attached to, as defined in Cluster API. For - this infrastructure provider, the name is equivalent to the name - of the ElfDeploymentZone. + description: |- + FailureDomain is the failure domain unique identifier this Machine should be attached to, as defined in Cluster API. + For this infrastructure provider, the name is equivalent to the name of the ElfDeploymentZone. type: string gpuDevices: description: GPUDevices is the list of physical GPUs used by the virtual @@ -98,8 +102,10 @@ spec: ha: type: boolean host: - description: Host is a unique identifier for a ELF host. Required - when cloneMode is FullClone. Defaults to AUTO_SCHEDULE. + description: |- + Host is a unique identifier for a ELF host. + Required when cloneMode is FullClone. + Defaults to AUTO_SCHEDULE. type: string memoryMiB: format: int64 @@ -112,21 +118,23 @@ spec: description: Devices is the list of network devices used by the virtual machine. items: - description: NetworkDeviceSpec defines the network configuration - for a virtual machine's network device. + description: |- + NetworkDeviceSpec defines the network configuration for a virtual machine's + network device. properties: addressesFromPools: - description: AddressesFromPools is a list of IPAddressPools - that should be assigned to IPAddressClaims. + description: |- + AddressesFromPools is a list of IPAddressPools that should be assigned + to IPAddressClaims. items: - description: TypedLocalObjectReference contains enough - information to let you locate the typed referenced object - inside the same namespace. + description: |- + TypedLocalObjectReference contains enough information to let you locate the + typed referenced object inside the same namespace. properties: apiGroup: - description: APIGroup is the group for the resource - being referenced. If APIGroup is not specified, - the specified Kind must be in the core API group. + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. For any other third-party types, APIGroup is required. 
type: string kind: @@ -142,19 +150,22 @@ spec: x-kubernetes-map-type: atomic type: array ipAddrs: - description: IPAddrs is a list of one or more IPv4 and/or - IPv6 addresses to assign to this device. Required when - DHCP4 and DHCP6 are both false. + description: |- + IPAddrs is a list of one or more IPv4 and/or IPv6 addresses to assign + to this device. + Required when DHCP4 and DHCP6 are both false. items: type: string type: array macAddr: - description: MACAddr is the MAC address used by this device. - It is generally a good idea to omit this field and allow - a MAC address to be generated. + description: |- + MACAddr is the MAC address used by this device. + It is generally a good idea to omit this field and allow a MAC address + to be generated. type: string netmask: - description: Netmask is the subnet mask used by this device. + description: |- + Netmask is the subnet mask used by this device. Required when DHCP4 is false. type: string networkType: @@ -163,9 +174,9 @@ spec: routes: description: Required when DHCP4 is false. items: - description: NetworkDeviceRouteSpec defines the network - configuration for a virtual machine's network device - route. + description: |- + NetworkDeviceRouteSpec defines the network configuration for a virtual machine's + network device route. properties: gateway: description: Gateway is the IPv4 gateway used by this @@ -189,28 +200,32 @@ spec: type: object type: array nameservers: - description: Nameservers is a list of IPv4 and/or IPv6 addresses - used as DNS nameservers. Please note that Linux allows only - three nameservers (https://linux.die.net/man/5/resolv.conf). + description: |- + Nameservers is a list of IPv4 and/or IPv6 addresses used as DNS + nameservers. + Please note that Linux allows only three nameservers (https://linux.die.net/man/5/resolv.conf). items: type: string type: array preferredAPIServerCidr: - description: PreferredAPIServeCIDR is the preferred CIDR for the - Kubernetes API server endpoint on this machine + description: |- + PreferredAPIServeCIDR is the preferred CIDR for the Kubernetes API + server endpoint on this machine type: string required: - devices type: object numCPUS: - description: NumCPUs is the number of virtual processors in a VM. - Defaults to the analogue property value in the template from which - this machine is cloned. + description: |- + NumCPUs is the number of virtual processors in a VM. + Defaults to the analogue property value in the template from which this + machine is cloned. format: int32 type: integer numCoresPerSocket: - description: NumCoresPerSocket is the number of cores among which - to distribute CPUs in this VM. + description: |- + NumCoresPerSocket is the number of cores among which to distribute CPUs + in this VM. format: int32 type: integer osType: @@ -221,7 +236,8 @@ spec: - WINDOWS type: string providerID: - description: ProviderID is the virtual machine's UUID formatted as + description: |- + ProviderID is the virtual machine's UUID formatted as elf://f0f6f65d-0786-4170-9ab9-d02187a61ad6 type: string template: @@ -277,37 +293,37 @@ spec: operational state. properties: lastTransitionTime: - description: Last time the condition transitioned from one status - to another. This should be when the underlying condition changed. - If that is not known, then using the time when the API field - changed is acceptable. + description: |- + Last time the condition transitioned from one status to another. + This should be when the underlying condition changed. 
If that is not known, then using the time when + the API field changed is acceptable. format: date-time type: string message: - description: A human readable message indicating details about - the transition. This field may be empty. + description: |- + A human readable message indicating details about the transition. + This field may be empty. type: string reason: - description: The reason for the condition's last transition - in CamelCase. The specific API may choose whether or not this - field is considered a guaranteed API. This field may not be - empty. + description: |- + The reason for the condition's last transition in CamelCase. + The specific API may choose whether or not this field is considered a guaranteed API. + This field may not be empty. type: string severity: - description: Severity provides an explicit classification of - Reason code, so the users or machines can immediately understand - the current situation and act accordingly. The Severity field - MUST be set only when Status=False. + description: |- + Severity provides an explicit classification of Reason code, so the users or machines can immediately + understand the current situation and act accordingly. + The Severity field MUST be set only when Status=False. type: string status: description: Status of the condition, one of True, False, Unknown. type: string type: - description: Type of condition in CamelCase or in foo.example.com/CamelCase. - Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. + description: |- + Type of condition in CamelCase or in foo.example.com/CamelCase. + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions + can be useful (see .node.status.conditions), the ability to deconflict is important. type: string required: - lastTransitionTime @@ -316,38 +332,51 @@ spec: type: object type: array failureMessage: - description: "FailureMessage will be set in the event that there is - a terminal problem reconciling the Machine and will contain a more - verbose string suitable for logging and human consumption. \n This - field should not be set for transitive errors that a controller - faces that are expected to be fixed automatically over time (like - service outages), but instead indicate that something is fundamentally - wrong with the Machine's spec or the configuration of the controller, - and that manual intervention is required. Examples of terminal errors - would be invalid combinations of settings in the spec, values that - are unsupported by the controller, or the responsible controller - itself being critically misconfigured. \n Any transient errors that - occur during the reconciliation of Machines can be added as events - to the Machine object and/or logged in the controller's output." + description: |- + FailureMessage will be set in the event that there is a terminal problem + reconciling the Machine and will contain a more verbose string suitable + for logging and human consumption. + + + This field should not be set for transitive errors that a controller + faces that are expected to be fixed automatically over + time (like service outages), but instead indicate that something is + fundamentally wrong with the Machine's spec or the configuration of + the controller, and that manual intervention is required. 
Examples + of terminal errors would be invalid combinations of settings in the + spec, values that are unsupported by the controller, or the + responsible controller itself being critically misconfigured. + + + Any transient errors that occur during the reconciliation of Machines + can be added as events to the Machine object and/or logged in the + controller's output. type: string failureReason: - description: "FailureReason will be set in the event that there is - a terminal problem reconciling the Machine and will contain a succinct - value suitable for machine interpretation. \n This field should - not be set for transitive errors that a controller faces that are - expected to be fixed automatically over time (like service outages), - but instead indicate that something is fundamentally wrong with - the Machine's spec or the configuration of the controller, and that - manual intervention is required. Examples of terminal errors would - be invalid combinations of settings in the spec, values that are - unsupported by the controller, or the responsible controller itself - being critically misconfigured. \n Any transient errors that occur - during the reconciliation of Machines can be added as events to - the Machine object and/or logged in the controller's output." + description: |- + FailureReason will be set in the event that there is a terminal problem + reconciling the Machine and will contain a succinct value suitable + for machine interpretation. + + + This field should not be set for transitive errors that a controller + faces that are expected to be fixed automatically over + time (like service outages), but instead indicate that something is + fundamentally wrong with the Machine's spec or the configuration of + the controller, and that manual intervention is required. Examples + of terminal errors would be invalid combinations of settings in the + spec, values that are unsupported by the controller, or the + responsible controller itself being critically misconfigured. + + + Any transient errors that occur during the reconciliation of Machines + can be added as events to the Machine object and/or logged in the + controller's output. type: string gpuDevices: - description: GPUDevices returns the GPU devices status for each of - the machine's configured GPU devices. + description: |- + GPUDevices returns the GPU devices status for each of the machine's configured + GPU devices. items: description: GPUStatus provides information about one of a VM's GPU device. @@ -359,25 +388,29 @@ spec: type: object type: array hostServerName: - description: HostServerName is the name of host server where the virtual - machine runs on. This value is set automatically at runtime and - should not be set or modified by users. + description: |- + HostServerName is the name of host server where the virtual machine runs on. + This value is set automatically at runtime and should not be set or + modified by users. type: string hostServerRef: - description: HostServerRef is the Tower ID of host server where the - virtual machine runs on. This value is set automatically at runtime - and should not be set or modified by users. + description: |- + HostServerRef is the Tower ID of host server where the virtual machine runs on. + This value is set automatically at runtime and should not be set or + modified by users. type: string network: - description: Network returns the network status for each of the machine's - configured network interfaces. 
+ description: |- + Network returns the network status for each of the machine's configured + network interfaces. items: description: NetworkStatus provides information about one of a VM's networks. properties: connected: - description: Connected is a flag that indicates whether this - network is currently connected to the VM. + description: |- + Connected is a flag that indicates whether this network is currently + connected to the VM. type: boolean ipAddrs: description: IPAddrs is one or more IP addresses reported by @@ -396,9 +429,10 @@ spec: type: object type: array placementGroupRef: - description: PlacementGroupRef is the reference to the Tower PlacementGroup - which this ElfMachine belongs to. This value is set automatically - at runtime and should not be set or modified by users. + description: |- + PlacementGroupRef is the reference to the Tower PlacementGroup which this ElfMachine belongs to. + This value is set automatically at runtime and should not be set or + modified by users. type: string ready: description: Ready is true when the provider resource is ready. type: boolean resources: description: Resources records the resources allocated for the machine. properties: + cpu: + description: CPUCores is the total number of CPU cores allocated + for the virtual machine. + format: int32 + type: integer disk: format: int32 type: integer + memory: + anyOf: + - type: integer + - type: string + description: Memory is the total amount of memory in MiB allocated + for the virtual machine. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object taskRef: - description: TaskRef is a managed object reference to a Task related - to the machine. This value is set automatically at runtime and should - not be set or modified by users. + description: |- + TaskRef is a managed object reference to a Task related to the machine. + This value is set automatically at runtime and should not be set or + modified by users. type: string vmRef: - description: VMRef is used to lookup the VM. This value is set automatically - at runtime and should not be set or modified by users. + description: |- + VMRef is used to lookup the VM. + This value is set automatically at runtime and should not be set or + modified by users. type: string type: object type: object diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml index 0f7cbb17..8c9c03e2 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachinetemplates.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.13.0 + controller-gen.kubebuilder.io/version: v0.15.0 name: elfmachinetemplates.infrastructure.cluster.x-k8s.io spec: group: infrastructure.cluster.x-k8s.io @@ -21,14 +21,19 @@ spec: API. properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object.
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -51,10 +56,9 @@ spec: format: int32 type: integer failureDomain: - description: FailureDomain is the failure domain unique identifier - this Machine should be attached to, as defined in Cluster - API. For this infrastructure provider, the name is equivalent - to the name of the ElfDeploymentZone. + description: |- + FailureDomain is the failure domain unique identifier this Machine should be attached to, as defined in Cluster API. + For this infrastructure provider, the name is equivalent to the name of the ElfDeploymentZone. type: string gpuDevices: description: GPUDevices is the list of physical GPUs used @@ -79,8 +83,10 @@ spec: ha: type: boolean host: - description: Host is a unique identifier for a ELF host. Required - when cloneMode is FullClone. Defaults to AUTO_SCHEDULE. + description: |- + Host is a unique identifier for a ELF host. + Required when cloneMode is FullClone. + Defaults to AUTO_SCHEDULE. type: string memoryMiB: format: int64 @@ -93,23 +99,24 @@ spec: description: Devices is the list of network devices used by the virtual machine. items: - description: NetworkDeviceSpec defines the network configuration - for a virtual machine's network device. + description: |- + NetworkDeviceSpec defines the network configuration for a virtual machine's + network device. properties: addressesFromPools: - description: AddressesFromPools is a list of IPAddressPools - that should be assigned to IPAddressClaims. + description: |- + AddressesFromPools is a list of IPAddressPools that should be assigned + to IPAddressClaims. items: - description: TypedLocalObjectReference contains - enough information to let you locate the typed - referenced object inside the same namespace. + description: |- + TypedLocalObjectReference contains enough information to let you locate the + typed referenced object inside the same namespace. properties: apiGroup: - description: APIGroup is the group for the - resource being referenced. If APIGroup is - not specified, the specified Kind must be - in the core API group. For any other third-party - types, APIGroup is required. + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. type: string kind: description: Kind is the type of resource @@ -126,20 +133,23 @@ spec: x-kubernetes-map-type: atomic type: array ipAddrs: - description: IPAddrs is a list of one or more IPv4 - and/or IPv6 addresses to assign to this device. 
+ description: |- + IPAddrs is a list of one or more IPv4 and/or IPv6 addresses to assign + to this device. Required when DHCP4 and DHCP6 are both false. items: type: string type: array macAddr: - description: MACAddr is the MAC address used by - this device. It is generally a good idea to omit - this field and allow a MAC address to be generated. + description: |- + MACAddr is the MAC address used by this device. + It is generally a good idea to omit this field and allow a MAC address + to be generated. type: string netmask: - description: Netmask is the subnet mask used by - this device. Required when DHCP4 is false. + description: |- + Netmask is the subnet mask used by this device. + Required when DHCP4 is false. type: string networkType: description: NetworkType is the VM network type. @@ -147,8 +157,8 @@ spec: routes: description: Required when DHCP4 is false. items: - description: NetworkDeviceRouteSpec defines the - network configuration for a virtual machine's + description: |- + NetworkDeviceRouteSpec defines the network configuration for a virtual machine's network device route. properties: gateway: @@ -174,28 +184,32 @@ spec: type: object type: array nameservers: - description: Nameservers is a list of IPv4 and/or IPv6 - addresses used as DNS nameservers. Please note that - Linux allows only three nameservers (https://linux.die.net/man/5/resolv.conf). + description: |- + Nameservers is a list of IPv4 and/or IPv6 addresses used as DNS + nameservers. + Please note that Linux allows only three nameservers (https://linux.die.net/man/5/resolv.conf). items: type: string type: array preferredAPIServerCidr: - description: PreferredAPIServeCIDR is the preferred CIDR - for the Kubernetes API server endpoint on this machine + description: |- + PreferredAPIServeCIDR is the preferred CIDR for the Kubernetes API + server endpoint on this machine type: string required: - devices type: object numCPUS: - description: NumCPUs is the number of virtual processors in - a VM. Defaults to the analogue property value in the template - from which this machine is cloned. + description: |- + NumCPUs is the number of virtual processors in a VM. + Defaults to the analogue property value in the template from which this + machine is cloned. format: int32 type: integer numCoresPerSocket: - description: NumCoresPerSocket is the number of cores among - which to distribute CPUs in this VM. + description: |- + NumCoresPerSocket is the number of cores among which to distribute CPUs + in this VM. 
format: int32 type: integer osType: @@ -207,8 +221,9 @@ spec: - WINDOWS type: string providerID: - description: ProviderID is the virtual machine's UUID formatted - as elf://f0f6f65d-0786-4170-9ab9-d02187a61ad6 + description: |- + ProviderID is the virtual machine's UUID formatted as + elf://f0f6f65d-0786-4170-9ab9-d02187a61ad6 type: string template: description: Template is the name or ID of the template used diff --git a/controllers/elfmachine_controller_resources.go b/controllers/elfmachine_controller_resources.go index 1e890c43..0878dd53 100644 --- a/controllers/elfmachine_controller_resources.go +++ b/controllers/elfmachine_controller_resources.go @@ -22,6 +22,7 @@ import ( "github.com/smartxworks/cloudtower-go-sdk/v2/models" agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" capiremote "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/util/conditions" @@ -47,16 +48,16 @@ func (r *ElfMachineReconciler) reconcileVMResources(ctx goctx.Context, machineCt return false, nil } - if ok, err := r.reconcieVMVolume(ctx, machineCtx, vm, infrav1.ResourcesHotUpdatedCondition); err != nil || !ok { + if ok, err := r.reconcileVMCPUAndMemory(ctx, machineCtx, vm); err != nil || !ok { return ok, err } - // Agent needs to wait for the node exists before it can run and execute commands. - if machineutil.IsUpdatingElfMachineResources(machineCtx.ElfMachine) && - machineCtx.Machine.Status.NodeInfo == nil { - log.Info("Waiting for node exists for host agent expand vm root partition") + if ok, err := r.restartKubelet(ctx, machineCtx); err != nil || !ok { + return ok, err + } - return false, nil + if ok, err := r.reconcieVMVolume(ctx, machineCtx, vm, infrav1.ResourcesHotUpdatedCondition); err != nil || !ok { + return ok, err } if ok, err := r.expandVMRootPartition(ctx, machineCtx); err != nil || !ok { @@ -164,6 +165,12 @@ func (r *ElfMachineReconciler) expandVMRootPartition(ctx goctx.Context, machineC conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingRootPartitionReason, clusterv1.ConditionSeverityInfo, "") } + if machineCtx.Machine.Status.NodeInfo == nil { + log.Info("Waiting for node to exist for host agent to expand vm root partition") + + return false, nil + } + kubeClient, err := capiremote.NewClusterClient(ctx, "", r.Client, client.ObjectKey{Namespace: machineCtx.Cluster.Namespace, Name: machineCtx.Cluster.Name}) if err != nil { return false, err } @@ -214,3 +221,125 @@ return true, nil } + +// reconcileVMCPUAndMemory ensures that the VM CPU and memory are as expected.
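// Like the helpers above, reconcileVMCPUAndMemory follows the (ok bool, err error)
// convention: ok=true means the step is complete, while ok=false with a nil error
// means the step is still in progress and a later reconcile should retry.
// A minimal sketch (not part of this patch) of the status conversion performed
// below, assuming service.ByteToMiB floor-divides bytes by 1024*1024:
//
//	bytes := int64(4*1024*1024*1024 + 5) // hypothetical VM memory in bytes
//	mib := bytes / (1024 * 1024)         // 4096, what ByteToMiB is assumed to return
//	q := resource.NewQuantity(mib*1024*1024, resource.BinarySI)
//	q.String() // "4Gi": the VM memory rounded down to whole MiB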
+func (r *ElfMachineReconciler) reconcileVMCPUAndMemory(ctx goctx.Context, machineCtx *context.MachineContext, vm *models.VM) (bool, error) { + machineCtx.ElfMachine.Status.Resources.CPUCores = *vm.Vcpu + machineCtx.ElfMachine.Status.Resources.Memory = *resource.NewQuantity(service.ByteToMiB(*vm.Memory)*1024*1024, resource.BinarySI) + + if !(machineCtx.ElfMachine.Spec.NumCPUs > *vm.Vcpu || + machineCtx.ElfMachine.Spec.MemoryMiB > service.ByteToMiB(*vm.Memory)) { + return true, nil + } + + reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if reason == "" || + (reason != infrav1.ExpandingVMResourcesReason && reason != infrav1.ExpandingVMResourcesFailedReason) { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "") + + // Save the condition first, and then expand the resource capacity. + // This prevents the resource expansion from succeeding while the condition + // fails to be saved, which would leave ElfMachine without a record of it. + return false, nil + } + + log := ctrl.LoggerFrom(ctx) + + if ok := acquireTicketForUpdatingVM(machineCtx.ElfMachine.Name); !ok { + log.V(1).Info(fmt.Sprintf("The VM operation reaches rate limit, skip updating VM %s CPU and memory", machineCtx.ElfMachine.Status.VMRef)) + + return false, nil + } + + withTaskVM, err := machineCtx.VMService.UpdateVM(vm, machineCtx.ElfMachine) + if err != nil { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesFailedReason, clusterv1.ConditionSeverityWarning, err.Error()) + + return false, errors.Wrapf(err, "failed to trigger CPU and memory update for VM %s", *vm.Name) + } + + if reason == infrav1.ExpandingVMResourcesFailedReason { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "") + } + + machineCtx.ElfMachine.SetTask(*withTaskVM.TaskID) + + log.Info("Waiting for the VM CPU and memory to be updated", "vmRef", machineCtx.ElfMachine.Status.VMRef, "taskRef", machineCtx.ElfMachine.Status.TaskRef) + + return false, nil +} + +func (r *ElfMachineReconciler) restartKubelet(ctx goctx.Context, machineCtx *context.MachineContext) (bool, error) { + log := ctrl.LoggerFrom(ctx) + + reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if reason == "" { + return true, nil + } else if reason != infrav1.ExpandingVMResourcesReason && + reason != infrav1.ExpandingVMResourcesFailedReason && + reason != infrav1.RestartingKubeletReason && + reason != infrav1.RestartingKubeletFailedReason { + return true, nil + } + + if reason != infrav1.RestartingKubeletFailedReason { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.RestartingKubeletReason, clusterv1.ConditionSeverityInfo, "") + } + + // Agent needs to wait for the node to exist before it can run and execute commands.
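	// Note (background, hedged): CPU/memory hot-added to a running VM is
	// typically not reflected in the Node's reported capacity until kubelet
	// re-reads the machine info, which appears to be why this flow restarts
	// kubelet via a host-agent job before ResourcesHotUpdatedCondition is
	// marked true.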
+ if machineCtx.Machine.Status.NodeInfo == nil { + log.Info("Waiting for node to exist for host agent to restart kubelet") + + return false, nil + } + + kubeClient, err := capiremote.NewClusterClient(ctx, "", r.Client, client.ObjectKey{Namespace: machineCtx.Cluster.Namespace, Name: machineCtx.Cluster.Name}) + if err != nil { + return false, err + } + + agentJob, err := hostagent.GetHostJob(ctx, kubeClient, machineCtx.ElfMachine.Namespace, hostagent.GetRestartKubeletJobName(machineCtx.ElfMachine)) + if err != nil && !apierrors.IsNotFound(err) { + return false, err + } + + if agentJob == nil { + agentJob, err = hostagent.RestartMachineKubelet(ctx, kubeClient, machineCtx.ElfMachine) + if err != nil { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.RestartingKubeletFailedReason, clusterv1.ConditionSeverityWarning, err.Error()) + + return false, err + } + + log.Info("Waiting for kubelet to be restarted after expanding CPU and memory", "hostAgentJob", agentJob.Name) + + return false, nil + } + + switch agentJob.Status.Phase { + case agentv1.PhaseSucceeded: + log.Info("Restart kubelet succeeded", "hostAgentJob", agentJob.Name) + case agentv1.PhaseFailed: + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.RestartingKubeletFailedReason, clusterv1.ConditionSeverityWarning, agentJob.Status.FailureMessage) + log.Info("Restart kubelet failed, will try again after three minutes", "hostAgentJob", agentJob.Name, "failureMessage", agentJob.Status.FailureMessage) + + lastExecutionTime := agentJob.Status.LastExecutionTime + if lastExecutionTime == nil { + lastExecutionTime = &agentJob.CreationTimestamp + } + // Three minutes after the job fails, delete the job and try again. + if time.Now().After(lastExecutionTime.Add(3 * time.Minute)) { + if err := kubeClient.Delete(ctx, agentJob); err != nil { + return false, errors.Wrapf(err, "failed to delete restart kubelet job %s/%s for retry", agentJob.Namespace, agentJob.Name) + } + } + + return false, nil + default: + log.Info("Waiting for restart kubelet job done", "hostAgentJob", agentJob.Name, "jobStatus", agentJob.Status.Phase) + + return false, nil + } + + return true, nil +} diff --git a/controllers/elfmachine_controller_resources_test.go b/controllers/elfmachine_controller_resources_test.go index 3d6adf9a..a1345fdf 100644 --- a/controllers/elfmachine_controller_resources_test.go +++ b/controllers/elfmachine_controller_resources_test.go @@ -16,6 +16,7 @@ package controllers import ( "bytes" goctx "context" + "fmt" "time" "github.com/go-logr/logr" @@ -55,6 +56,9 @@ var _ = Describe("ElfMachineReconciler", func() { mockNewVMService service.NewVMServiceFunc ) + _, err := testEnv.CreateNamespace(goctx.Background(), "sks-system") + Expect(err).NotTo(HaveOccurred()) + BeforeEach(func() { logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -87,24 +91,6 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(logBuffer.String()).To(ContainSubstring("Waiting for hot updating resources")) }) - It("should wait for node exists", func() { - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) - vmVolume
:= fake.NewVMVolume(elfMachine) - vmDisk := fake.NewVMDisk(vmVolume) - vm := fake.NewTowerVMFromElfMachine(elfMachine) - vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} - mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) - mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) - reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} - ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for node exists for host agent expand vm root partition")) - }) - It("should mark ResourcesHotUpdatedCondition to true", func() { agentJob := newExpandRootPartitionJob(elfMachine) Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) @@ -261,7 +247,21 @@ var _ = Describe("ElfMachineReconciler", func() { expectConditions(elfMachine, []conditionAssertion{}) }) + It("should wait for node to exist", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for node to exist for host agent to expand vm root partition")) + }) + It("should create agent job to expand root partition", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) @@ -283,6 +283,7 @@ var _ = Describe("ElfMachineReconciler", func() { }) It("should retry when job failed", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") agentJob := newExpandRootPartitionJob(elfMachine) Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) @@ -322,6 +323,7 @@ var _ = Describe("ElfMachineReconciler", func() { }) It("should record job succeeded", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") agentJob := newExpandRootPartitionJob(elfMachine) Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) @@ -340,13 +342,186 @@ var _ = Describe("ElfMachineReconciler", func() { expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingRootPartitionReason}}) }) }) + + Context("restartKubelet", func() { + BeforeEach(func() { + var err error + kubeConfigSecret, err =
helpers.NewKubeConfigSecret(testEnv, cluster.Namespace, cluster.Name) + Expect(err).ShouldNot(HaveOccurred()) + }) + + It("should not restart kubelet without restart kubelet reason", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{}) + }) + + It("should wait for node to exist", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for node to exist for host agent to restart kubelet")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + }) + + It("should create agent job to restart kubelet", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for kubelet to be restarted after expanding CPU and memory")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + var agentJob *agentv1.HostOperationJob + Eventually(func() error { + var err error + agentJob, err = hostagent.GetHostJob(ctx, testEnv.Client, elfMachine.Namespace, hostagent.GetRestartKubeletJobName(elfMachine)) + return err + }, timeout).Should(BeNil()) + Expect(agentJob.Name).To(Equal(hostagent.GetRestartKubeletJobName(elfMachine))) + }) + + It("should retry when job failed", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "") + agentJob := newRestartKubelet(elfMachine) +
Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for restart kubelet job done")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + + logBuffer.Reset() + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource := agentJob.DeepCopy() + agentJob.Status.Phase = agentv1.PhaseFailed + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ok, err = reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Restart kubelet failed, will try again after three minutes")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.RestartingKubeletFailedReason}}) + + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource = agentJob.DeepCopy() + agentJob.Status.LastExecutionTime = &metav1.Time{Time: time.Now().Add(-3 * time.Minute).UTC()} + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ok, err = reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() bool { + err := testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue()) + }) + + It("should record job succeeded", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "") + agentJob := newRestartKubelet(elfMachine) + Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource := agentJob.DeepCopy() + agentJob.Status.Phase = agentv1.PhaseSucceeded + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) +
Expect(logBuffer.String()).To(ContainSubstring("Expand CPU and memory succeeded")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + }) + }) + + Context("reconcileVMCPUAndMemory", func() { + It("should not reconcile when numCPUs or memory is excepted", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineCtx := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + Expect(elfMachine.Status.Resources.CPUCores).To(Equal(*vm.Vcpu)) + Expect(elfMachine.Status.Resources.Memory.String()).To(Equal(fmt.Sprintf("%dMi", service.ByteToMiB(*vm.Memory)))) + }) + + It("should save the conditionType first", func() { + vm := fake.NewTowerVMFromElfMachine(elfMachine) + elfMachine.Spec.NumCPUs += 1 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineCtx := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMResourcesReason}}) + }) + + It("should wait task done", func() { + vm := fake.NewTowerVMFromElfMachine(elfMachine) + elfMachine.Spec.MemoryMiB += 1 + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMResourcesReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineCtx := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(nil, unexpectedError) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeFalse()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to trigger update CPU and memory for VM")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMResourcesFailedReason}}) + + logBuffer.Reset() + inMemoryCache.Flush() + task := fake.NewTowerTask() + withTaskVM := fake.NewWithTaskVM(vm, task) + mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(withTaskVM, nil) + reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err = reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeFalse()) + 
Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for the VM CPU and memory to be updated")) + Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMResourcesReason}}) + }) + }) }) func newExpandRootPartitionJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob { return &agentv1.HostOperationJob{ ObjectMeta: metav1.ObjectMeta{ Name: hostagent.GetExpandRootPartitionJobName(elfMachine), - Namespace: "default", + Namespace: "sks-system", }, Spec: agentv1.HostOperationJobSpec{ NodeName: elfMachine.Name, @@ -360,3 +535,22 @@ func newExpandRootPartitionJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOper }, } } + +func newRestartKubelet(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob { + return &agentv1.HostOperationJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: hostagent.GetRestartKubeletJobName(elfMachine), + Namespace: "sks-system", + }, + Spec: agentv1.HostOperationJobSpec{ + NodeName: elfMachine.Name, + Operation: agentv1.Operation{ + Ansible: &agentv1.Ansible{ + LocalPlaybookText: &agentv1.YAMLText{ + Inline: tasks.RestartKubeletTask, + }, + }, + }, + }, + } +} diff --git a/controllers/elfmachinetemplate_controller.go b/controllers/elfmachinetemplate_controller.go index 558fe747..3892862f 100644 --- a/controllers/elfmachinetemplate_controller.go +++ b/controllers/elfmachinetemplate_controller.go @@ -147,14 +147,7 @@ func (r *ElfMachineTemplateReconciler) Reconcile(ctx goctx.Context, req ctrl.Req // reconcileMachineResources ensures that the resources(disk capacity) of the // virtual machines are the same as expected by ElfMachine. -// TODO: CPU and memory will be supported in the future. func (r *ElfMachineTemplateReconciler) reconcileMachineResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (reconcile.Result, error) { - // The disk size is 0, it means the disk size is the same as the virtual machine template. - // So if the capacity is 0, it means that the disk size has not changed and returns directly. - if emtCtx.ElfMachineTemplate.Spec.Template.Spec.DiskGiB == 0 { - return reconcile.Result{}, nil - } - if ok, err := r.reconcileCPResources(ctx, emtCtx); err != nil { return reconcile.Result{}, err } else if !ok { @@ -495,12 +488,11 @@ func (r *ElfMachineTemplateReconciler) markElfMachinesToBeUpdatedResources(ctx g return err } - // Ensure resources are up to date.
diff --git a/controllers/elfmachinetemplate_controller.go b/controllers/elfmachinetemplate_controller.go
index 558fe747..3892862f 100644
--- a/controllers/elfmachinetemplate_controller.go
+++ b/controllers/elfmachinetemplate_controller.go
@@ -147,14 +147,7 @@ func (r *ElfMachineTemplateReconciler) Reconcile(ctx goctx.Context, req ctrl.Req
 
-// reconcileMachineResources ensures that the resources(disk capacity) of the
+// reconcileMachineResources ensures that the resources (CPU/memory/disk capacity) of the
 // virtual machines are the same as expected by ElfMachine.
-// TODO: CPU and memory will be supported in the future.
 func (r *ElfMachineTemplateReconciler) reconcileMachineResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (reconcile.Result, error) {
-	// The disk size is 0, it means the disk size is the same as the virtual machine template.
-	// So if the capacity is 0, it means that the disk size has not changed and returns directly.
-	if emtCtx.ElfMachineTemplate.Spec.Template.Spec.DiskGiB == 0 {
-		return reconcile.Result{}, nil
-	}
-
 	if ok, err := r.reconcileCPResources(ctx, emtCtx); err != nil {
 		return reconcile.Result{}, err
 	} else if !ok {
@@ -495,12 +488,11 @@ func (r *ElfMachineTemplateReconciler) markElfMachinesToBeUpdatedResources(ctx g
 			return err
 		}
 
-		// Ensure resources are up to date.
-		orignalDiskGiB := elfMachine.Spec.DiskGiB
-		elfMachine.Spec.DiskGiB = elfMachineTemplate.Spec.Template.Spec.DiskGiB
+		originalDiskGiB, originalMemoryMiB, originalNumCPUs, originalNumCoresPerSocket := ensureResourcesUpToDate(elfMachine, elfMachineTemplate)
 		conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "")
 
-		log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for updating resources(disk: %d -> %d)", orignalDiskGiB, elfMachine.Spec.DiskGiB), "elfMachine", elfMachine.Name)
+		log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for updating resources(disk: %d -> %d, memory: %d -> %d, cpu: %d -> %d, numCoresPerSocket: %d -> %d)",
+			originalDiskGiB, elfMachine.Spec.DiskGiB, originalMemoryMiB, elfMachine.Spec.MemoryMiB, originalNumCPUs, elfMachine.Spec.NumCPUs, originalNumCoresPerSocket, elfMachine.Spec.NumCoresPerSocket), "elfMachine", elfMachine.Name)
 
 		if err := patchHelper.Patch(ctx, elfMachine); err != nil {
 			return errors.Wrapf(err, "failed to patch ElfMachine %s to mark for updating resources", elfMachine.Name)
@@ -526,12 +518,11 @@ func (r *ElfMachineTemplateReconciler) markElfMachinesResourcesNotUpToDate(ctx g
 			return err
 		}
 
-		// Ensure resources are up to date.
-		orignalDiskGiB := elfMachine.Spec.DiskGiB
-		elfMachine.Spec.DiskGiB = elfMachineTemplate.Spec.Template.Spec.DiskGiB
+		originalDiskGiB, originalMemoryMiB, originalNumCPUs, originalNumCoresPerSocket := ensureResourcesUpToDate(elfMachine, elfMachineTemplate)
 		conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, anotherMachineHotUpdateInProgressMessage)
 
-		log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources(disk: %d -> %d)", orignalDiskGiB, elfMachine.Spec.DiskGiB), "elfMachine", elfMachine.Name)
+		log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources(disk: %d -> %d, memory: %d -> %d, cpu: %d -> %d, numCoresPerSocket: %d -> %d)",
+			originalDiskGiB, elfMachine.Spec.DiskGiB, originalMemoryMiB, elfMachine.Spec.MemoryMiB, originalNumCPUs, elfMachine.Spec.NumCPUs, originalNumCoresPerSocket, elfMachine.Spec.NumCoresPerSocket), "elfMachine", elfMachine.Name)
 
 		if err := patchHelper.Patch(ctx, elfMachine); err != nil {
 			return errors.Wrapf(err, "failed to patch ElfMachine %s to mark for resources not up to date", elfMachine.Name)
@@ -540,3 +531,17 @@ func (r *ElfMachineTemplateReconciler) markElfMachinesResourcesNotUpToDate(ctx g
 
 	return nil
 }
+
+// ensureResourcesUpToDate syncs the disk/memory/CPU fields from the ElfMachineTemplate to the ElfMachine and returns the original values.
+func ensureResourcesUpToDate(elfMachine *infrav1.ElfMachine, elfMachineTemplate *infrav1.ElfMachineTemplate) (int32, int64, int32, int32) {
+	originalDiskGiB := elfMachine.Spec.DiskGiB
+	elfMachine.Spec.DiskGiB = elfMachineTemplate.Spec.Template.Spec.DiskGiB
+	originalMemoryMiB := elfMachine.Spec.MemoryMiB
+	elfMachine.Spec.MemoryMiB = elfMachineTemplate.Spec.Template.Spec.MemoryMiB
+	originalNumCPUs := elfMachine.Spec.NumCPUs
+	elfMachine.Spec.NumCPUs = elfMachineTemplate.Spec.Template.Spec.NumCPUs
+	originalNumCoresPerSocket := elfMachine.Spec.NumCoresPerSocket
+	elfMachine.Spec.NumCoresPerSocket = elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket
+
+	return originalDiskGiB, originalMemoryMiB, originalNumCPUs, originalNumCoresPerSocket
+}
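Reviewer note: a quick standalone illustration (not part of the diff) of the helper's contract: ensureResourcesUpToDate mutates the ElfMachine spec in place and returns the pre-update values solely so callers can log the transition. Types and literals below are simplified stand-ins.

// sync_sketch.go - illustrative only.
package main

import "fmt"

// spec is a pared-down stand-in for the resource fields of an ElfMachine spec.
type spec struct {
	diskGiB, numCPUs int32
	memoryMiB        int64
}

// syncFromTemplate mirrors ensureResourcesUpToDate: copy the template values
// into the machine and return the originals for logging.
func syncFromTemplate(m, tmpl *spec) (int32, int64, int32) {
	origDisk, origMem, origCPU := m.diskGiB, m.memoryMiB, m.numCPUs
	m.diskGiB, m.memoryMiB, m.numCPUs = tmpl.diskGiB, tmpl.memoryMiB, tmpl.numCPUs
	return origDisk, origMem, origCPU
}

func main() {
	machine := &spec{diskGiB: 20, numCPUs: 2, memoryMiB: 4096}
	template := &spec{diskGiB: 40, numCPUs: 4, memoryMiB: 8192}
	d, mem, c := syncFromTemplate(machine, template)
	fmt.Printf("disk: %d -> %d, memory: %d -> %d, cpu: %d -> %d\n",
		d, machine.diskGiB, mem, machine.memoryMiB, c, machine.numCPUs)
	// Output: disk: 20 -> 40, memory: 4096 -> 8192, cpu: 2 -> 4
}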
diff --git a/controllers/elfmachinetemplate_controller_test.go b/controllers/elfmachinetemplate_controller_test.go
index e3b5605a..647a4b47 100644
--- a/controllers/elfmachinetemplate_controller_test.go
+++ b/controllers/elfmachinetemplate_controller_test.go
@@ -16,6 +16,7 @@ package controllers
 import (
 	"bytes"
 	"fmt"
+	"math/rand"
 	"time"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -157,7 +158,7 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 		Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md))))
 
 		logBuffer.Reset()
-		elfMachine.Spec.DiskGiB -= 1
+		setResourcesNotUpToDate(elfMachine, emt)
 		ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
 		mtCtx = newMachineTemplateContext(elfCluster, cluster, emt)
@@ -169,19 +170,21 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 		Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources"))
 		Expect(logBuffer.String()).To(ContainSubstring("Waiting for worker ElfMachines to be updated resources"))
 
-		// logBuffer.Reset()
-		// elfMachine.Spec.DiskGiB -= 1
-		// updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster)
-		// fake.ToWorkerMachine(updatingElfMachine, md)
-		// fake.ToWorkerMachine(updatingMachine, md)
-		// fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt)
-		// ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md, updatingElfMachine, updatingMachine)
-		// fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
-		// fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine)
-		// mtCtx = newMachineTemplateContext(elfCluster, cluster, emt)
-		// reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx}
-		// ok, err = reconciler.reconcileWorkerResources(ctx, mtCtx)
-		// Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources"))
+		logBuffer.Reset()
+		setResourcesNotUpToDate(elfMachine, emt)
+		updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster)
+		fake.ToWorkerMachine(updatingElfMachine, md)
+		fake.ToWorkerMachine(updatingMachine, md)
+		fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt)
+		ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md, updatingElfMachine, updatingMachine)
+		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
+		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine)
+		mtCtx = newMachineTemplateContext(elfCluster, cluster, emt)
+		reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx}
+		ok, err = reconciler.reconcileWorkerResources(ctx, mtCtx)
+		Expect(ok).To(BeFalse())
+		Expect(err).NotTo(HaveOccurred())
+		Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources"))
 	})
 
 	It("selectToBeUpdatedAndNeedUpdatedElfMachines", func() {
@@ -214,7 +217,7 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 			InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: "notfoud"},
 		}
 		cluster.Spec.ControlPlaneRef = &corev1.ObjectReference{Namespace: kcp.Namespace, Name: kcp.Name}
-		elfMachine.Spec.DiskGiB -= 1
+		setResourcesNotUpToDate(elfMachine, emt)
 		ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
 		mtCtx := newMachineTemplateContext(elfCluster, cluster, emt)
@@ -257,7 +260,7 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 		kcp.Status.UpdatedReplicas = 2
 		fake.ToControlPlaneMachine(elfMachine, kcp)
 		fake.ToControlPlaneMachine(machine, kcp)
-		elfMachine.Spec.DiskGiB -= 1
+		setResourcesNotUpToDate(elfMachine, emt)
 		machine.Status.NodeRef = &corev1.ObjectReference{}
 		conditions.MarkTrue(machine, controlplanev1.MachineAPIServerPodHealthyCondition)
 		conditions.MarkTrue(machine, controlplanev1.MachineControllerManagerPodHealthyCondition)
@@ -426,9 +429,15 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 		emt := fake.NewElfMachineTemplate()
 		upToDateElfMachine, upToDateMachine := fake.NewMachineObjects(elfCluster, cluster)
 		fake.SetElfMachineTemplateForElfMachine(upToDateElfMachine, emt)
-		noUpToDateElfMachine, noUpToDateMachine := fake.NewMachineObjects(elfCluster, cluster)
-		fake.SetElfMachineTemplateForElfMachine(noUpToDateElfMachine, emt)
-		noUpToDateElfMachine.Spec.DiskGiB -= 1
+		diskNotUpToDateElfMachine, diskNotUpToDateMachine := fake.NewMachineObjects(elfCluster, cluster)
+		fake.SetElfMachineTemplateForElfMachine(diskNotUpToDateElfMachine, emt)
+		diskNotUpToDateElfMachine.Spec.DiskGiB -= 1
+		memoryNotUpToDateElfMachine, memoryNotUpToDateMachine := fake.NewMachineObjects(elfCluster, cluster)
+		fake.SetElfMachineTemplateForElfMachine(memoryNotUpToDateElfMachine, emt)
+		memoryNotUpToDateElfMachine.Spec.MemoryMiB -= 1
+		cpuNotUpToDateElfMachine, cpuNotUpToDateMachine := fake.NewMachineObjects(elfCluster, cluster)
+		fake.SetElfMachineTemplateForElfMachine(cpuNotUpToDateElfMachine, emt)
+		cpuNotUpToDateElfMachine.Spec.NumCPUs -= 1
 		updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster)
 		fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt)
 		conditions.MarkFalse(updatingElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "")
@@ -438,21 +447,25 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 		failedMachine.Status.Phase = string(clusterv1.MachinePhaseFailed)
 		ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret,
 			upToDateElfMachine, upToDateMachine,
-			noUpToDateElfMachine, noUpToDateMachine,
+			diskNotUpToDateElfMachine, diskNotUpToDateMachine,
+			memoryNotUpToDateElfMachine, memoryNotUpToDateMachine,
+			cpuNotUpToDateElfMachine, cpuNotUpToDateMachine,
 			updatingElfMachine, updatingMachine,
 			failedElfMachine, failedMachine,
 		)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, upToDateElfMachine, upToDateMachine)
-		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, noUpToDateElfMachine, noUpToDateMachine)
+		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, diskNotUpToDateElfMachine, diskNotUpToDateMachine)
+		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, memoryNotUpToDateElfMachine, memoryNotUpToDateMachine)
+		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, cpuNotUpToDateElfMachine, cpuNotUpToDateMachine)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, failedElfMachine, failedMachine)
 		reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx}
-		elfMachines := []*infrav1.ElfMachine{upToDateElfMachine, noUpToDateElfMachine, updatingElfMachine, failedElfMachine}
+		elfMachines := []*infrav1.ElfMachine{upToDateElfMachine, diskNotUpToDateElfMachine, memoryNotUpToDateElfMachine, cpuNotUpToDateElfMachine, updatingElfMachine, failedElfMachine}
 		updatingResourcesElfMachines, needUpdatedResourcesElfMachines, err := reconciler.selectResourcesNotUpToDateElfMachines(ctx, emt, elfMachines)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(updatingResourcesElfMachines).To(Equal([]*infrav1.ElfMachine{updatingElfMachine}))
-		Expect(needUpdatedResourcesElfMachines).To(Equal([]*infrav1.ElfMachine{noUpToDateElfMachine}))
+		Expect(needUpdatedResourcesElfMachines).To(Equal([]*infrav1.ElfMachine{diskNotUpToDateElfMachine, memoryNotUpToDateElfMachine, cpuNotUpToDateElfMachine}))
 	})
 })
 
@@ -460,7 +473,7 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 	It("should mark resources to be updated", func() {
 		emt := fake.NewElfMachineTemplate()
 		fake.SetElfMachineTemplateForElfMachine(elfMachine, emt)
-		elfMachine.Spec.DiskGiB -= 1
+		setResourcesNotUpToDate(elfMachine, emt)
 		ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
 		reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx}
@@ -488,7 +501,7 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 		expectConditions(elfMachine, []conditionAssertion{})
 
 		logBuffer.Reset()
-		elfMachine.Spec.DiskGiB -= 1
+		setResourcesNotUpToDate(elfMachine, emt)
 		ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret)
 		fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine)
 		reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx}
@@ -505,3 +518,18 @@ var _ = Describe("ElfMachineTemplateReconciler", func() {
 	})
 })
+
+// setResourcesNotUpToDate syncs elfMachine to the template, then shrinks one randomly chosen resource so it falls behind the template.
+func setResourcesNotUpToDate(elfMachine *infrav1.ElfMachine, emt *infrav1.ElfMachineTemplate) {
+	fake.SetElfMachineTemplateForElfMachine(elfMachine, emt)
+	seededRand := rand.New(rand.NewSource(time.Now().UnixNano()))
+	n := seededRand.Intn(3)
+	switch n {
+	case 0:
+		elfMachine.Spec.DiskGiB -= 1
+	case 1:
+		elfMachine.Spec.MemoryMiB -= 1
+	default:
+		elfMachine.Spec.NumCPUs -= 1
+	}
+}
diff --git a/pkg/hostagent/service.go b/pkg/hostagent/service.go
index 2518f230..bcbb9086 100644
--- a/pkg/hostagent/service.go
+++ b/pkg/hostagent/service.go
@@ -33,7 +33,7 @@ func GetHostJob(ctx goctx.Context, c client.Client, namespace, name string) (*ag
 	var restartKubeletJob agentv1.HostOperationJob
 	if err := c.Get(ctx, apitypes.NamespacedName{
 		Name:      name,
-		Namespace: "default",
+		Namespace: "sks-system",
 	}, &restartKubeletJob); err != nil {
 		return nil, err
 	}
@@ -47,11 +47,15 @@ func GetExpandRootPartitionJobName(elfMachine *infrav1.ElfMachine) string {
 	return fmt.Sprintf("cape-expand-root-partition-%s-%d", elfMachine.Name, elfMachine.Spec.DiskGiB)
 }
 
+func GetRestartKubeletJobName(elfMachine *infrav1.ElfMachine) string {
+	return fmt.Sprintf("cape-restart-kubelet-%s-%d-%d-%d", elfMachine.Name, elfMachine.Spec.NumCPUs, elfMachine.Spec.NumCoresPerSocket, elfMachine.Spec.MemoryMiB)
+}
+
 func ExpandRootPartition(ctx goctx.Context, c client.Client, elfMachine *infrav1.ElfMachine) (*agentv1.HostOperationJob, error) {
 	agentJob := &agentv1.HostOperationJob{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      GetExpandRootPartitionJobName(elfMachine),
-			Namespace: "default",
+			Namespace: "sks-system",
 		},
 		Spec: agentv1.HostOperationJobSpec{
 			NodeName: elfMachine.Name,
@@ -72,3 +76,29 @@ func ExpandRootPartition(ctx goctx.Context, c client.Client, elfMachine *infrav1
 
 	return agentJob, nil
 }
+
+func RestartMachineKubelet(ctx goctx.Context, c client.Client, elfMachine *infrav1.ElfMachine) (*agentv1.HostOperationJob, error) {
+	restartKubeletJob := &agentv1.HostOperationJob{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      GetRestartKubeletJobName(elfMachine),
+			Namespace: "sks-system",
+		},
+		Spec: agentv1.HostOperationJobSpec{
+			NodeName: elfMachine.Name,
+			Operation: agentv1.Operation{
+				Ansible: &agentv1.Ansible{
+					LocalPlaybookText: &agentv1.YAMLText{
+						Inline: tasks.RestartKubeletTask,
+					},
+				},
+				Timeout: metav1.Duration{Duration: defaultTimeout},
+			},
+		},
+	}
+
+	if err := c.Create(ctx, restartKubeletJob); err != nil {
+		return nil, err
+	}
+
+	return restartKubeletJob, nil
+}
diff --git a/pkg/hostagent/tasks/restart_kubelet.yaml b/pkg/hostagent/tasks/restart_kubelet.yaml
new file mode 100644
index 00000000..d2705247
--- /dev/null
+++ b/pkg/hostagent/tasks/restart_kubelet.yaml
@@ -0,0 +1,10 @@
+---
+- name: Kubelet | restart kubelet
+  hosts: all
+  become: true
+  gather_facts: false
+  tasks:
+    - name: Restart kubelet service
+      ansible.builtin.service:
+        name: kubelet
+        state: restarted
diff --git a/pkg/hostagent/tasks/tasks.go b/pkg/hostagent/tasks/tasks.go
index 8a38d3df..7cffa99d 100644
--- a/pkg/hostagent/tasks/tasks.go
+++ b/pkg/hostagent/tasks/tasks.go
@@ -21,3 +21,8 @@ import (
 //
 //go:embed expand_root_partition.yaml
 var ExpandRootPartitionTask string
+
+// RestartKubeletTask is the embedded Ansible playbook that restarts kubelet.
+//
+//go:embed restart_kubelet.yaml
+var RestartKubeletTask string
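Side note: the embed wiring above follows Go's standard go:embed pattern. A minimal self-contained sketch (the file name hello.yaml is hypothetical and must exist next to the source file at build time):

// embed_sketch.go - minimal illustration of the //go:embed pattern used by
// the tasks package; not part of the diff.
package main

import (
	_ "embed" // blank import enables the //go:embed directive for plain strings
	"fmt"
)

//go:embed hello.yaml
var helloPlaybook string // file contents are compiled into the binary

func main() {
	// The embedded YAML travels as a plain string, which is exactly how
	// RestartKubeletTask is inlined into the HostOperationJob above.
	fmt.Print(helloPlaybook)
}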
diff --git a/pkg/service/util.go b/pkg/service/util.go
index 6eed2517..c0bd17ca 100644
--- a/pkg/service/util.go
+++ b/pkg/service/util.go
@@ -34,14 +34,14 @@ import (
 func GetUpdatedVMRestrictedFields(vm *models.VM, elfMachine *infrav1.ElfMachine) map[string]string {
 	fieldMap := make(map[string]string)
 	vCPU := TowerVCPU(elfMachine.Spec.NumCPUs)
-	cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket)
-	cpuSockets := TowerCPUSockets(*vCPU, *cpuCores)
+	cpuSocketCores := TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vCPU)
+	cpuSockets := TowerCPUSockets(*vCPU, *cpuSocketCores)
 
 	if *vm.Vcpu > *vCPU {
 		fieldMap["vcpu"] = fmt.Sprintf("actual: %d, expected: %d", *vm.Vcpu, *vCPU)
 	}
-	if *vm.CPU.Cores > *cpuCores {
-		fieldMap["cpuCores"] = fmt.Sprintf("actual: %d, expected: %d", *vm.CPU.Cores, *cpuCores)
+	if *vm.CPU.Cores > *cpuSocketCores {
+		fieldMap["cpuCores"] = fmt.Sprintf("actual: %d, expected: %d", *vm.CPU.Cores, *cpuSocketCores)
 	}
 	if *vm.CPU.Sockets > *cpuSockets {
 		fieldMap["cpuSockets"] = fmt.Sprintf("actual: %d, expected: %d", *vm.CPU.Sockets, *cpuSockets)
@@ -136,16 +136,16 @@ func TowerVCPU(vCPU int32) *int32 {
 	return &vCPU
 }
 
-func TowerCPUCores(cpuCores, vCPU int32) *int32 {
-	if cpuCores <= 0 {
-		cpuCores = vCPU
+func TowerCPUSocketCores(cpuSocketCores, vCPU int32) *int32 {
+	if cpuSocketCores <= 0 {
+		cpuSocketCores = vCPU
 	}
 
-	return &cpuCores
+	return &cpuSocketCores
 }
 
-func TowerCPUSockets(vCPU, cpuCores int32) *int32 {
-	cpuSockets := vCPU / cpuCores
+func TowerCPUSockets(vCPU, cpuSocketCores int32) *int32 {
+	cpuSockets := vCPU / cpuSocketCores
 
 	return &cpuSockets
 }
@@ -154,6 +154,10 @@ func ByteToGiB(bytes int64) int32 {
 	return int32(bytes / 1024 / 1024 / 1024)
 }
 
+func ByteToMiB(bytes int64) int64 {
+	return bytes / 1024 / 1024
+}
+
 func IsVMInRecycleBin(vm *models.VM) bool {
 	return vm.InRecycleBin != nil && *vm.InRecycleBin
 }
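A worked example of the topology helpers above (standalone, with illustrative values): cores-per-socket defaults to the full vCPU count when unset, socket count falls out of integer division, and ByteToMiB is the usual byte-to-mebibyte division.

// topology_sketch.go - not part of the diff; mirrors the semantics of
// TowerCPUSocketCores, TowerCPUSockets and ByteToMiB.
package main

import "fmt"

// socketCores returns coresPerSocket, defaulting to vCPU (one big socket)
// when the requested value is zero or negative.
func socketCores(coresPerSocket, vCPU int32) int32 {
	if coresPerSocket <= 0 {
		return vCPU
	}
	return coresPerSocket
}

func main() {
	vCPU := int32(8)

	cores := socketCores(4, vCPU)
	fmt.Printf("%d cores/socket, %d sockets\n", cores, vCPU/cores) // 4 cores/socket, 2 sockets

	cores = socketCores(0, vCPU)
	fmt.Printf("%d cores/socket, %d sockets\n", cores, vCPU/cores) // defaulted: 8 cores/socket, 1 socket

	// ByteToMiB equivalent: 8 GiB of memory reported in bytes -> 8192 MiB.
	fmt.Println(int64(8589934592) / 1024 / 1024)
}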
diff --git a/pkg/service/vm.go b/pkg/service/vm.go
index bb93fa30..0ff9a462 100644
--- a/pkg/service/vm.go
+++ b/pkg/service/vm.go
@@ -102,15 +102,17 @@ type TowerVMService struct {
 func (svr *TowerVMService) UpdateVM(vm *models.VM, elfMachine *infrav1.ElfMachine) (*models.WithTaskVM, error) {
 	vCPU := TowerVCPU(elfMachine.Spec.NumCPUs)
-	cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket)
-	cpuSockets := TowerCPUSockets(*vCPU, *cpuCores)
+	cpuSocketCores := TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vCPU)
+	cpuSockets := TowerCPUSockets(*vCPU, *cpuSocketCores)
+	memory := TowerMemory(elfMachine.Spec.MemoryMiB)
 
 	updateVMParams := clientvm.NewUpdateVMParams()
 	updateVMParams.RequestBody = &models.VMUpdateParams{
 		Data: &models.VMUpdateParamsData{
 			Vcpu:       vCPU,
-			CPUCores:   cpuCores,
+			CPUCores:   cpuSocketCores,
 			CPUSockets: cpuSockets,
+			Memory:     memory,
 		},
 		Where: &models.VMWhereInput{ID: TowerString(*vm.ID)},
 	}
@@ -188,8 +190,8 @@ func (svr *TowerVMService) Clone(
 	}
 
 	vCPU := TowerVCPU(elfMachine.Spec.NumCPUs)
-	cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket)
-	cpuSockets := TowerCPUSockets(*vCPU, *cpuCores)
+	cpuSocketCores := TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vCPU)
+	cpuSockets := TowerCPUSockets(*vCPU, *cpuSocketCores)
 
 	gpuDevices := make([]*models.VMGpuOperationParams, len(gpuDeviceInfos))
 	for i := 0; i < len(gpuDeviceInfos); i++ {
@@ -299,7 +301,7 @@ func (svr *TowerVMService) Clone(
 			Description: TowerString(fmt.Sprintf(config.VMDescription, elfCluster.Spec.Tower.Server)),
 			Owner:       owner,
 			Vcpu:        vCPU,
-			CPUCores:    cpuCores,
+			CPUCores:    cpuSocketCores,
 			CPUSockets:  cpuSockets,
 			Memory:      TowerMemory(elfMachine.Spec.MemoryMiB),
 			GpuDevices:  gpuDevices,
diff --git a/pkg/util/machine/machine.go b/pkg/util/machine/machine.go
index 53772b88..c51968c7 100644
--- a/pkg/util/machine/machine.go
+++ b/pkg/util/machine/machine.go
@@ -186,7 +186,9 @@ func IsUpdatingElfMachineResources(elfMachine *infrav1.ElfMachine) bool {
 }
 
 func IsResourcesUpToDate(elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachine *infrav1.ElfMachine) bool {
-	return elfMachineTemplate.Spec.Template.Spec.DiskGiB <= elfMachine.Spec.DiskGiB
+	return elfMachine.Spec.DiskGiB >= elfMachineTemplate.Spec.Template.Spec.DiskGiB &&
+		elfMachine.Spec.MemoryMiB >= elfMachineTemplate.Spec.Template.Spec.MemoryMiB &&
+		elfMachine.Spec.NumCPUs >= elfMachineTemplate.Spec.Template.Spec.NumCPUs
 }
 
 func NeedUpdateElfMachineResources(elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachine *infrav1.ElfMachine) bool {
diff --git a/pkg/util/machine/machine_test.go b/pkg/util/machine/machine_test.go
index 1a338fa7..83a8fc82 100644
--- a/pkg/util/machine/machine_test.go
+++ b/pkg/util/machine/machine_test.go
@@ -306,6 +306,16 @@ func TestIsResourcesUpToDate(t *testing.T) {
 	g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeTrue())
 	elfMachine.Spec.DiskGiB -= 1
 	g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeFalse())
+
+	fake.SetElfMachineTemplateForElfMachine(elfMachine, emt)
+	g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeTrue())
+	elfMachine.Spec.MemoryMiB -= 1
+	g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeFalse())
+
+	fake.SetElfMachineTemplateForElfMachine(elfMachine, emt)
+	g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeTrue())
+	elfMachine.Spec.NumCPUs -= 1
+	g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeFalse())
 }
 
 func toString(s string) *string {
diff --git a/test/fake/tower.go b/test/fake/tower.go
index 4c76575e..0117211d 100644
--- a/test/fake/tower.go
+++ b/test/fake/tower.go
@@ -78,8 +78,8 @@ func NewTowerVMFromElfMachine(elfMachine *infrav1.ElfMachine) *models.VM {
 	vm.Name = service.TowerString(elfMachine.Name)
 	vm.Vcpu = service.TowerVCPU(elfMachine.Spec.NumCPUs)
 	vm.CPU = &models.NestedCPU{
-		Cores:   service.TowerCPUCores(*vm.Vcpu, elfMachine.Spec.NumCoresPerSocket),
-		Sockets: service.TowerCPUSockets(*vm.Vcpu, *service.TowerCPUCores(*vm.Vcpu, elfMachine.Spec.NumCoresPerSocket)),
+		Cores:   service.TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vm.Vcpu),
+		Sockets: service.TowerCPUSockets(*vm.Vcpu, *service.TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vm.Vcpu)),
 	}
 	vm.Memory = service.TowerMemory(elfMachine.Spec.MemoryMiB)
 	vm.Ha = service.TowerBool(elfMachine.Spec.HA)
diff --git a/webhooks/elfmachine_webhook_mutation.go b/webhooks/elfmachine_webhook_mutation.go
index 9c53b92d..477ff26e 100644
--- a/webhooks/elfmachine_webhook_mutation.go
+++ b/webhooks/elfmachine_webhook_mutation.go
@@ -63,6 +63,10 @@ func (m *ElfMachineMutation) Handle(ctx goctx.Context, request admission.Request
 		version.SetCurrentCAPEVersion(&elfMachine)
 	}
 
+	if elfMachine.Spec.NumCoresPerSocket <= 0 {
+		elfMachine.Spec.NumCoresPerSocket = elfMachine.Spec.NumCPUs
+	}
+
 	if marshaledElfMachine, err := json.Marshal(elfMachine); err != nil {
 		return admission.Errored(http.StatusInternalServerError, err)
 	} else {
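Reviewer note: a standalone sketch (not the production code) of the up-to-date rule in IsResourcesUpToDate above. Every comparison is >=, so a machine that already meets or exceeds the template counts as up to date, and shrinking a template never triggers a hot update. Values are illustrative.

// uptodate_sketch.go - illustrative only.
package main

import "fmt"

// res is a simplified stand-in for the resource fields compared by
// IsResourcesUpToDate.
type res struct {
	diskGiB, numCPUs int32
	memoryMiB        int64
}

// upToDate mirrors the rule: the machine must meet or exceed the template.
func upToDate(tmpl, m res) bool {
	return m.diskGiB >= tmpl.diskGiB &&
		m.memoryMiB >= tmpl.memoryMiB &&
		m.numCPUs >= tmpl.numCPUs
}

func main() {
	tmpl := res{diskGiB: 40, numCPUs: 4, memoryMiB: 8192}
	fmt.Println(upToDate(tmpl, res{diskGiB: 40, numCPUs: 4, memoryMiB: 8192}))  // true: matches
	fmt.Println(upToDate(tmpl, res{diskGiB: 60, numCPUs: 8, memoryMiB: 16384})) // true: exceeds
	fmt.Println(upToDate(tmpl, res{diskGiB: 40, numCPUs: 4, memoryMiB: 4096}))  // false: memory lags
}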
diff --git a/webhooks/elfmachine_webhook_mutation_test.go b/webhooks/elfmachine_webhook_mutation_test.go
index 8fd51214..bde2321b 100644
--- a/webhooks/elfmachine_webhook_mutation_test.go
+++ b/webhooks/elfmachine_webhook_mutation_test.go
@@ -50,6 +50,7 @@ func TestElfMachineMutation(t *testing.T) {
 
 	elfMachine := fake.NewElfMachine(nil)
 	elfMachine.Annotations = nil
+	elfMachine.Spec.NumCoresPerSocket = 0
 	raw, err := marshal(elfMachine)
 	g.Expect(err).NotTo(HaveOccurred())
 	tests = append(tests, testCase{
@@ -62,6 +63,7 @@ func TestElfMachineMutation(t *testing.T) {
 		expectRespAllowed: true,
 		expectPatchs: []jsonpatch.Operation{
 			{Operation: "add", Path: "/metadata/annotations", Value: map[string]interface{}{infrav1.CAPEVersionAnnotation: version.CAPEVersion()}},
+			{Operation: "add", Path: "/spec/numCoresPerSocket", Value: float64(elfMachine.Spec.NumCPUs)},
 		},
 	})
 
@@ -72,7 +74,7 @@ func TestElfMachineMutation(t *testing.T) {
 			resp := mutation.Handle(context.Background(), tc.admissionRequest)
 
 			g.Expect(resp.Allowed).Should(Equal(tc.expectRespAllowed))
-			g.Expect(resp.Patches).Should(Equal(tc.expectPatchs))
+			g.Expect(resp.Patches).Should(ContainElements(tc.expectPatchs))
 		})
 	}
 }
diff --git a/webhooks/elfmachine_webhook_validation.go b/webhooks/elfmachine_webhook_validation.go
index 959089da..3e953bf6 100644
--- a/webhooks/elfmachine_webhook_validation.go
+++ b/webhooks/elfmachine_webhook_validation.go
@@ -74,7 +74,7 @@ func (v *ElfMachineValidator) ValidateUpdate(ctx goctx.Context, oldObj, newObj r
 	elfMachineTemplateName := annotationsutil.GetTemplateClonedFromName(elfMachine)
 	if elfMachineTemplateName == "" {
 		if elfMachine.Spec.DiskGiB < oldElfMachine.Spec.DiskGiB {
-			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, diskCapacityCanOnlyBeExpanded))
+			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, diskCapacityCanOnlyBeExpandedMsg))
 		}
 
 		return nil, aggregateObjErrors(elfMachine.GroupVersionKind().GroupKind(), elfMachine.Name, allErrs)
@@ -94,6 +94,12 @@ func (v *ElfMachineValidator) ValidateUpdate(ctx goctx.Context, oldObj, newObj r
 	if elfMachine.Spec.DiskGiB != elfMachineTemplate.Spec.Template.Spec.DiskGiB {
 		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName)))
 	}
+	if elfMachine.Spec.MemoryMiB != elfMachineTemplate.Spec.Template.Spec.MemoryMiB {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "memoryMiB"), elfMachine.Spec.MemoryMiB, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName)))
+	}
+	if elfMachine.Spec.NumCPUs != elfMachineTemplate.Spec.Template.Spec.NumCPUs {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "numCPUs"), elfMachine.Spec.NumCPUs, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName)))
+	}
 
 	return nil, aggregateObjErrors(elfMachine.GroupVersionKind().GroupKind(), elfMachine.Name, allErrs)
 }
diff --git a/webhooks/elfmachine_webhook_validation_test.go b/webhooks/elfmachine_webhook_validation_test.go
index 9ffa3217..8097a4fd 100644
--- a/webhooks/elfmachine_webhook_validation_test.go
+++ b/webhooks/elfmachine_webhook_validation_test.go
@@ -63,7 +63,7 @@ func TestElfMachineValidatorValidateUpdate(t *testing.T) {
 			},
 		},
 		Errs: field.ErrorList{
-			field.Invalid(field.NewPath("spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpanded),
+			field.Invalid(field.NewPath("spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpandedMsg),
 		},
 	})
diff --git a/webhooks/elfmachinetemplate_webhook_mutation.go b/webhooks/elfmachinetemplate_webhook_mutation.go
index bf1cb0e8..161341a8 100644
--- a/webhooks/elfmachinetemplate_webhook_mutation.go
+++ b/webhooks/elfmachinetemplate_webhook_mutation.go
@@ -63,6 +63,10 @@ func (m *ElfMachineTemplateMutation) Handle(ctx goctx.Context, request admission
 		return admission.Errored(http.StatusBadRequest, err)
 	}
 
+	if elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket <= 0 {
+		elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket = elfMachineTemplate.Spec.Template.Spec.NumCPUs
+	}
+
 	devices := elfMachineTemplate.Spec.Template.Spec.Network.Devices
 	for i := 0; i < len(devices); i++ {
 		for j := 0; j < len(devices[i].AddressesFromPools); j++ {
diff --git a/webhooks/elfmachinetemplate_webhook_mutation_test.go b/webhooks/elfmachinetemplate_webhook_mutation_test.go
index 64531f14..fda4bf91 100644
--- a/webhooks/elfmachinetemplate_webhook_mutation_test.go
+++ b/webhooks/elfmachinetemplate_webhook_mutation_test.go
@@ -42,7 +42,7 @@ func TestElfMachineMutationTemplate(t *testing.T) {
 		},
 		Spec: infrav1.ElfMachineTemplateSpec{
 			Template: infrav1.ElfMachineTemplateResource{
-				Spec: infrav1.ElfMachineSpec{},
+				Spec: infrav1.ElfMachineSpec{NumCoresPerSocket: 1},
 			},
 		},
 	}
@@ -71,6 +71,24 @@ func TestElfMachineMutationTemplate(t *testing.T) {
 		},
 	})
 
+	elfMachineTemplate.Spec.Template.Spec.Network.Devices = nil
+	elfMachineTemplate.Spec.Template.Spec.NumCPUs = 1
+	elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket = 0
+	raw, err = marshal(elfMachineTemplate)
+	g.Expect(err).NotTo(HaveOccurred())
+	tests = append(tests, testCase{
+		name: "should set default values for numCoresPerSocket",
+		admissionRequest: admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{
+			Kind:      metav1.GroupVersionKind{Group: infrav1.GroupVersion.Group, Version: infrav1.GroupVersion.Version, Kind: "ElfMachineTemplate"},
+			Operation: admissionv1.Create,
+			Object:    runtime.RawExtension{Raw: raw},
+		}},
+		expectRespAllowed: true,
+		expectPatchs: []jsonpatch.Operation{
+			{Operation: "add", Path: "/spec/template/spec/numCoresPerSocket", Value: float64(elfMachineTemplate.Spec.Template.Spec.NumCPUs)},
+		},
+	})
+
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
 			mutation := ElfMachineTemplateMutation{}
diff --git a/webhooks/elfmachinetemplate_webhook_validation.go b/webhooks/elfmachinetemplate_webhook_validation.go
index 2597b2ac..7bbfaf83 100644
--- a/webhooks/elfmachinetemplate_webhook_validation.go
+++ b/webhooks/elfmachinetemplate_webhook_validation.go
@@ -33,7 +33,12 @@ import (
 
 // Error messages.
 const (
-	diskCapacityCannotLessThanZeroMsg = "the disk capacity can only greater than or equal to 0"
-	diskCapacityCanOnlyBeExpanded     = "the disk capacity can only be expanded"
+	diskCapacityCannotLessThanZeroMsg = "the disk capacity can only be greater than or equal to 0"
+	diskCapacityCanOnlyBeExpandedMsg  = "the disk capacity can only be expanded"
+
+	memoryCannotLessThanZeroMsg = "the memory can only be greater than 0"
+
+	numCPUsCannotLessThanZeroMsg           = "the numCPUs can only be greater than 0"
+	numCoresPerSocketCannotLessThanZeroMsg = "the numCoresPerSocket can only be greater than 0"
 )
 
 func (v *ElfMachineTemplateValidator) SetupWebhookWithManager(mgr ctrl.Manager) error {
@@ -62,6 +67,18 @@ func (v *ElfMachineTemplateValidator) ValidateCreate(ctx goctx.Context, obj runt
 		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), elfMachineTemplate.Spec.Template.Spec.DiskGiB, diskCapacityCannotLessThanZeroMsg))
 	}
 
+	if elfMachineTemplate.Spec.Template.Spec.MemoryMiB <= 0 {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "memoryMiB"), elfMachineTemplate.Spec.Template.Spec.MemoryMiB, memoryCannotLessThanZeroMsg))
+	}
+
+	if elfMachineTemplate.Spec.Template.Spec.NumCPUs <= 0 {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCPUs"), elfMachineTemplate.Spec.Template.Spec.NumCPUs, numCPUsCannotLessThanZeroMsg))
+	}
+
+	if elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket <= 0 {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket, numCoresPerSocketCannotLessThanZeroMsg))
+	}
+
 	return nil, aggregateObjErrors(elfMachineTemplate.GroupVersionKind().GroupKind(), elfMachineTemplate.Name, allErrs)
 }
 
@@ -78,7 +95,19 @@ func (v *ElfMachineTemplateValidator) ValidateUpdate(ctx goctx.Context, oldObj,
 	var allErrs field.ErrorList
 
 	if elfMachineTemplate.Spec.Template.Spec.DiskGiB < oldElfMachineTemplate.Spec.Template.Spec.DiskGiB {
-		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), elfMachineTemplate.Spec.Template.Spec.DiskGiB, diskCapacityCanOnlyBeExpanded))
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), elfMachineTemplate.Spec.Template.Spec.DiskGiB, diskCapacityCanOnlyBeExpandedMsg))
+	}
+
+	if elfMachineTemplate.Spec.Template.Spec.MemoryMiB <= 0 {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "memoryMiB"), elfMachineTemplate.Spec.Template.Spec.MemoryMiB, memoryCannotLessThanZeroMsg))
+	}
+
+	if elfMachineTemplate.Spec.Template.Spec.NumCPUs <= 0 {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCPUs"), elfMachineTemplate.Spec.Template.Spec.NumCPUs, numCPUsCannotLessThanZeroMsg))
+	}
+
+	if elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket <= 0 {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket, numCoresPerSocketCannotLessThanZeroMsg))
 	}
 
 	return nil, aggregateObjErrors(elfMachineTemplate.GroupVersionKind().GroupKind(), elfMachineTemplate.Name, allErrs)
 }
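For readers skimming the webhook changes, a compact standalone sketch of the template rules enforced above (not the webhook code itself; on create there is no old object, which the nil old pointer models): disk may be zero, which the controller treats as "same size as the VM template", but never negative and never shrunk on update, while memory, numCPUs and numCoresPerSocket must be positive.

// validation_rules_sketch.go - illustrative only.
package main

import "fmt"

type tmpl struct {
	diskGiB, numCPUs, numCoresPerSocket int32
	memoryMiB                           int64
}

// validate condenses the create/update rules; old is nil on create.
func validate(old *tmpl, t tmpl) []string {
	var errs []string
	if t.diskGiB < 0 {
		errs = append(errs, "diskGiB must be >= 0")
	}
	if old != nil && t.diskGiB < old.diskGiB {
		errs = append(errs, "diskGiB can only be expanded")
	}
	if t.memoryMiB <= 0 {
		errs = append(errs, "memoryMiB must be > 0")
	}
	if t.numCPUs <= 0 {
		errs = append(errs, "numCPUs must be > 0")
	}
	if t.numCoresPerSocket <= 0 {
		errs = append(errs, "numCoresPerSocket must be > 0")
	}
	return errs
}

func main() {
	old := tmpl{diskGiB: 2, numCPUs: 1, numCoresPerSocket: 1, memoryMiB: 1}
	fmt.Println(validate(&old, tmpl{diskGiB: 1, numCPUs: 1, numCoresPerSocket: 1, memoryMiB: 1}))
	// Output: [diskGiB can only be expanded]
}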
diff --git a/webhooks/elfmachinetemplate_webhook_validation_test.go b/webhooks/elfmachinetemplate_webhook_validation_test.go
index 1ae4eb99..ab69555f 100644
--- a/webhooks/elfmachinetemplate_webhook_validation_test.go
+++ b/webhooks/elfmachinetemplate_webhook_validation_test.go
@@ -37,7 +37,10 @@ func TestElfMachineTemplateValidatorValidateCreate(t *testing.T) {
 		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
 			Template: infrav1.ElfMachineTemplateResource{
 				Spec: infrav1.ElfMachineSpec{
-					DiskGiB: -1,
+					NumCPUs:           1,
+					NumCoresPerSocket: 1,
+					MemoryMiB:         1,
+					DiskGiB:           -1,
 				},
 			},
 		}},
@@ -49,7 +52,10 @@ func TestElfMachineTemplateValidatorValidateCreate(t *testing.T) {
 		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
 			Template: infrav1.ElfMachineTemplateResource{
 				Spec: infrav1.ElfMachineSpec{
-					DiskGiB: 0,
+					NumCPUs:           1,
+					NumCoresPerSocket: 1,
+					MemoryMiB:         1,
+					DiskGiB:           0,
 				},
 			},
 		}},
@@ -59,11 +65,59 @@ func TestElfMachineTemplateValidatorValidateCreate(t *testing.T) {
 		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
 			Template: infrav1.ElfMachineTemplateResource{
 				Spec: infrav1.ElfMachineSpec{
-					DiskGiB: 100,
+					NumCPUs:           1,
+					NumCoresPerSocket: 1,
+					MemoryMiB:         1,
+					DiskGiB:           100,
 				},
 			},
 		}},
 		Errs: nil,
+	}, testCaseEMT{
+		Name: "memory must be greater than 0",
+		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{
+					NumCPUs:           1,
+					NumCoresPerSocket: 1,
+					DiskGiB:           100,
+					MemoryMiB:         0,
+				},
+			},
+		}},
+		Errs: field.ErrorList{
+			field.Invalid(field.NewPath("spec", "template", "spec", "memoryMiB"), 0, memoryCannotLessThanZeroMsg),
+		},
+	}, testCaseEMT{
+		Name: "numCPUs must be greater than 0",
+		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{
+					NumCoresPerSocket: 1,
+					DiskGiB:           100,
+					MemoryMiB:         1,
+					NumCPUs:           0,
+				},
+			},
+		}},
+		Errs: field.ErrorList{
+			field.Invalid(field.NewPath("spec", "template", "spec", "numCPUs"), 0, numCPUsCannotLessThanZeroMsg),
+		},
+	}, testCaseEMT{
+		Name: "numCoresPerSocket must be greater than 0",
+		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{
+					NumCPUs:           1,
+					DiskGiB:           100,
+					MemoryMiB:         1,
+					NumCoresPerSocket: 0,
+				},
+			},
+		}},
+		Errs: field.ErrorList{
+			field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), 0, numCoresPerSocketCannotLessThanZeroMsg),
+		},
 	})
 
 	validator := &ElfMachineTemplateValidator{}
@@ -86,19 +140,85 @@ func TestElfMachineTemplateValidatorValidateUpdate(t *testing.T) {
 		OldEMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
 			Template: infrav1.ElfMachineTemplateResource{
 				Spec: infrav1.ElfMachineSpec{
-					DiskGiB: 2,
+					NumCPUs:           1,
+					NumCoresPerSocket: 1,
+					MemoryMiB:         1,
+					DiskGiB:           2,
+				},
+			},
+		}},
+		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{
+					NumCPUs:           1,
+					NumCoresPerSocket: 1,
+					MemoryMiB:         1,
+					DiskGiB:           1,
+				},
+			},
+		}},
+		Errs: field.ErrorList{
+			field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpandedMsg),
+		},
+	}, testCaseEMT{
+		Name: "memory must be greater than 0",
+		OldEMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{},
+			},
+		}},
+		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{
+					NumCPUs:           1,
+					NumCoresPerSocket: 1,
+					DiskGiB:           1,
+					MemoryMiB:         0,
 				},
 			},
 		}},
+		Errs: field.ErrorList{
+			field.Invalid(field.NewPath("spec", "template", "spec", "memoryMiB"), 0, memoryCannotLessThanZeroMsg),
+		},
+	}, testCaseEMT{
+		Name: "numCPUs must be greater than 0",
+		OldEMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{},
+			},
+		}},
+		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{
+					NumCoresPerSocket: 1,
+					DiskGiB:           1,
+					MemoryMiB:         1,
+					NumCPUs:           0,
+				},
+			},
+		}},
+		Errs: field.ErrorList{
+			field.Invalid(field.NewPath("spec", "template", "spec", "numCPUs"), 0, numCPUsCannotLessThanZeroMsg),
+		},
+	}, testCaseEMT{
+		Name: "numCoresPerSocket must be greater than 0",
+		OldEMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
+			Template: infrav1.ElfMachineTemplateResource{
+				Spec: infrav1.ElfMachineSpec{},
+			},
+		}},
 		EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{
 			Template: infrav1.ElfMachineTemplateResource{
 				Spec: infrav1.ElfMachineSpec{
-					DiskGiB: 1,
+					DiskGiB:           1,
+					MemoryMiB:         1,
+					NumCPUs:           1,
+					NumCoresPerSocket: 0,
 				},
 			},
 		}},
 		Errs: field.ErrorList{
-			field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpanded),
+			field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), 0, numCoresPerSocketCannotLessThanZeroMsg),
 		},
 	})