From 93292b183bd8d250d91c46c6ab074d49464ec961 Mon Sep 17 00:00:00 2001 From: Oriol Vilarrubi Date: Mon, 8 Jan 2024 13:21:05 +0100 Subject: [PATCH] TPU - Add reserved property for nodeset_tpu --- CHANGELOG.md | 4 ++++ scripts/util.py | 5 +++++ .../modules/slurm_nodeset_tpu/README_TF.md | 1 + .../slurm_cluster/modules/slurm_nodeset_tpu/main.tf | 4 ++++ .../modules/slurm_nodeset_tpu/variables.tf | 6 ++++++ .../modules/slurm_nodeset_tpu/versions.tf | 10 ++++++++-- 6 files changed, 28 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24c0039d7e..5765ebafc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ All notable changes to this project will be documented in this file. +## \[6.3.1\] + +- Add reserved property for nodeset_tpu + ## \[6.3.0\] - Upgrade installed Slurm to 23.02.7 diff --git a/scripts/util.py b/scripts/util.py index fe8fb7428b..0fe9fef6be 100755 --- a/scripts/util.py +++ b/scripts/util.py @@ -1148,6 +1148,10 @@ def enable_public_ip(self): def preemptible(self): return self._nodeset.preemptible + @property + def reserved(self): + return self._nodeset.reserved + @property def service_account(self): return self._nodeset.service_account @@ -1277,6 +1281,7 @@ def create_node(self, nodename): node.service_account.email = self.nodeset.service_account.email node.service_account.scope = self.nodeset.service_account.scopes node.scheduling_config.preemptible = self.preemptible + node.scheduling_config.reserved = self.reserved if self.nodeset.network: node.network_config.network = self.nodeset.network if self.nodeset.subnetwork: diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md index 1d9157573f..bad238161a 100644 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md +++ b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md @@ -59,6 +59,7 @@ No modules. | [preemptible](#input\_preemptible) | Specify whether TPU-vms in this nodeset are preemtible, see https://cloud.google.com/tpu/docs/preemptible for details. | `bool` | `false` | no | | [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted | `bool` | `true` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | +| [reserved](#input\_reserved) | Specify whether TPU-vms in this nodeset are created under a reservation. | `bool` | `false` | no | | [service\_account](#input\_service\_account) | Service account to attach to the TPU-vm.
If none is given, the default service account and scopes will be used. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [subnetwork](#input\_subnetwork) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | `null` | no | | [tf\_version](#input\_tf\_version) | Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details. | `string` | n/a | yes | diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf index fbb8988577..8ac7344536 100644 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf +++ b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf @@ -111,6 +111,10 @@ resource "null_resource" "nodeset_tpu" { condition = sum([var.node_count_dynamic_max, var.node_count_static]) > 0 error_message = "Sum of node_count_dynamic_max and node_count_static must be > 0." } + precondition { + condition = !(var.preemptible && var.reserved) + error_message = "Nodeset cannot be preemptible and reserved at the same time." + } precondition { condition = !(var.subnetwork == null && !var.enable_public_ip) error_message = "Using the default subnetwork for the TPU nodeset requires enable_public_ip set to true." diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf index 1b6c270cd8..de51f7a970 100644 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf +++ b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf @@ -77,6 +77,12 @@ variable "preemptible" { default = false } +variable "reserved" { + description = "Specify whether TPU-vms in this nodeset are created under a reservation." + type = bool + default = false +} + variable "preserve_tpu" { description = "Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted" type = bool diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/versions.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/versions.tf index a4f0cbde02..cc7222cae2 100644 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/versions.tf +++ b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/versions.tf @@ -18,7 +18,13 @@ terraform { required_version = "~> 1.2" required_providers { - google = ">= 3.53, < 5.0" - null = "~> 3.0" + google = { + source = "hashicorp/google" + version = ">= 3.53, < 5.0" + } + null = { + source = "hashicorp/null" + version = "~> 3.0" + } } }