diff --git a/benchmarks/benchmark/tools/dlio/README.md b/benchmarks/benchmark/tools/dlio/README.md index 1453c879e..99dd87b75 100644 --- a/benchmarks/benchmark/tools/dlio/README.md +++ b/benchmarks/benchmark/tools/dlio/README.md @@ -16,12 +16,21 @@ If you need to reinstall any resources, make sure to delete this file as well. ## Run DLIO Job 1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload -2. Change the dlio image in `dlio/podspec.tpl` +2. Change the dlio image in `modules/dlio/podspec.tpl` 3. Run `terraform init` 4. Run `terraform apply` 5. After you finish your test, run `terraform destroy` to delete the resources +## Run DLIO Job with Parallelstore +Pre-reqs: right now you'll need to manually set up the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`. + +1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload, notably set `gcs_fuse_csi_driver_enabled` to `false` and `paralllestore_csi_driver_enabled` to `true`. If you want to use static provisioning, update the "parallelstore variables" and set `parallelstore_storageclass` to `""`. +2. Change the dlio image in `modules/dlio/podspec.tpl` to a desired version. We have tested the job with dlio v0.5.1. +3. Run `terraform init` +4. Run `terraform apply -target=module.ps_storage` +5. Run `terraform apply` after the dataloader job is completed; pvc patch failure is OK for dynamic provisioning. + ## Check Test Result The test result reports are located in the `${dlio_benchmark_result}` directory. 
For example, if you use a GCS bucket to store the training dataset, the GCS bucket will be mounted at
diff --git a/benchmarks/benchmark/tools/dlio/main.tf b/benchmarks/benchmark/tools/dlio/main.tf
index 824b2228d..20e8027b9 100644
--- a/benchmarks/benchmark/tools/dlio/main.tf
+++ b/benchmarks/benchmark/tools/dlio/main.tf
@@ -31,6 +31,26 @@ module "gcs_pv_pvc" {
   gcs_bucket = var.gcs_bucket
 }
 
+module "ps_storage" {
+  source = "./modules/parallelstore_storage"
+  count  = var.paralllestore_csi_driver_enabled == "\"true\"" ? 1 : 0
+
+  pv_name                       = var.pv_name
+  pvc_name                      = var.pvc_name
+  gcs_bucket                    = var.gcs_bucket
+  ps_instance_name              = var.parallelstore_instance_name
+  ps_ip_address_1               = var.parallelstore_ip_address_1
+  ps_ip_address_2               = var.parallelstore_ip_address_2
+  ps_ip_address_3               = var.parallelstore_ip_address_3
+  ps_network_name               = var.parallelstore_network_name
+  location                      = var.parallelstore_location
+  storageclass                  = var.parallelstore_storageclass
+  project                       = var.parallelstore_project
+  k8s_service_account           = var.k8s_service_account
+  run_parallelstore_data_loader = var.run_parallelstore_data_loader
+  namespace                     = var.namespace
+}
+
 module "dlio" {
   source = "./modules/dlio"
 
@@ -43,6 +63,9 @@ module "dlio" {
   gcs_fuse_sidecar_cpu_limit               = var.gcs_fuse_sidecar_cpu_limit
   gcs_fuse_sidecar_memory_limit            = var.gcs_fuse_sidecar_memory_limit
   gcs_fuse_sidecar_ephemeral_storage_limit = var.gcs_fuse_sidecar_ephemeral_storage_limit
+  pscsi_driver_enabled                     = var.paralllestore_csi_driver_enabled
+  pscsi_sidecar_cpu_limit                  = var.pscsi_sidecar_cpu_limit
+  pscsi_sidecar_memory_limit               = var.pscsi_sidecar_memory_limit
   dlio_container_cpu_limit                 = var.dlio_container_cpu_limit
   dlio_container_memory_limit              = var.dlio_container_memory_limit
   dlio_container_ephemeral_storage         = var.dlio_container_ephemeral_storage
diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf b/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
index 96691b5e1..be3019eed 100644
--- a/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
+++ b/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
@@ -22,6 +22,9 @@ resource "local_file" "podspec" {
     gcs_fuse_sidecar_cpu_limit               = "${var.gcs_fuse_sidecar_cpu_limit}"
     gcs_fuse_sidecar_memory_limit            = "${var.gcs_fuse_sidecar_memory_limit}"
     gcs_fuse_sidecar_ephemeral_storage_limit = "${var.gcs_fuse_sidecar_ephemeral_storage_limit}"
+    pscsi_driver_enabled                     = "${var.pscsi_driver_enabled}"
+    pscsi_sidecar_cpu_limit                  = "${var.pscsi_sidecar_cpu_limit}"
+    pscsi_sidecar_memory_limit               = "${var.pscsi_sidecar_memory_limit}"
     dlio_container_cpu_limit                 = "${var.dlio_container_cpu_limit}"
     dlio_container_memory_limit              = "${var.dlio_container_memory_limit}"
     dlio_container_ephemeral_storage         = "${var.dlio_container_ephemeral_storage}"
@@ -48,4 +51,4 @@ resource "local_file" "podspec" {
 
 resource "kubectl_manifest" "podspec" {
   yaml_body = resource.local_file.podspec.content
-}
\ No newline at end of file
+}
diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl b/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
index 43390b53e..dd58b988a 100644
--- a/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
+++ b/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
@@ -16,6 +16,9 @@ spec:
         gke-gcsfuse/cpu-limit: ${gcs_fuse_sidecar_cpu_limit}
         gke-gcsfuse/memory-limit: ${gcs_fuse_sidecar_memory_limit}
         gke-gcsfuse/ephemeral-storage-limit: ${gcs_fuse_sidecar_ephemeral_storage_limit}
+        gke-parallelstore/volumes: ${pscsi_driver_enabled}
+        gke-parallelstore/cpu-limit: ${pscsi_sidecar_cpu_limit}
+        gke-parallelstore/memory-limit: ${pscsi_sidecar_memory_limit}
     spec:
       containers:
       - name: dlio
diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf b/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
index 3564d5098..e5f4c5979 100644
--- a/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
+++ b/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
@@ -68,6 +68,18 @@ variable "gcs_fuse_sidecar_ephemeral_storage_limit" {
   description = "The maximum amount of Ephemeral Storage resource that the sidecar container can use"
 }
 
+variable "pscsi_driver_enabled" {
+  type = string
+}
+
+variable "pscsi_sidecar_cpu_limit" {
+  type = string
+}
+
+variable "pscsi_sidecar_memory_limit" {
+  type = string
+}
+
 variable "dlio_container_cpu_limit" {
   type        = number
   description = "The maximum amount of CPU resource that the DLIO benchmark workload container can use"
diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/dataloader_job.tpl b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/dataloader_job.tpl
new file mode 100644
index 000000000..4a76bf1ad
--- /dev/null
+++ b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/dataloader_job.tpl
@@ -0,0 +1,41 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: data-loader-job
+  namespace: ${namespace}
+spec:
+  backoffLimit: 0
+  template:
+    metadata:
+      name: data-loader-job
+      annotations:
+        gke-parallelstore/volumes: "true"
+        gke-parallelstore/cpu-limit: "5"  # annotation values must be strings; a bare 5 is a YAML integer
+        gke-parallelstore/memory-limit: 5Gi
+    spec:
+      restartPolicy: Never
+      containers:
+      - name: data-loader
+        image: google/cloud-sdk:latest
+        env:
+        - name: BUCKET_NAME
+          value: ${gcs_bucket}
+        command:
+        - "/bin/sh"
+        - "-c"
+        - gsutil -m cp -R gs://$BUCKET_NAME/* /disk/;
+        resources:
+          limits:
+            cpu: "10"
+            memory: 20Gi
+          requests:
+            cpu: "10"
+            memory: 20Gi
+        volumeMounts:
+        - name: ml-perf-volume
+          mountPath: /disk
+      serviceAccountName: ${service_account}
+      volumes:
+      - name: ml-perf-volume
+        persistentVolumeClaim:
+          claimName: ${pvc_name}
diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv.tpl b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv.tpl
new file mode 100644
index 000000000..cf989c118
--- /dev/null
+++ b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv.tpl
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: ${pv_name}
+spec:
+  storageClassName: ""
+  capacity:
+    storage: 12Ti
+  accessModes:
+    - ReadWriteMany
+  persistentVolumeReclaimPolicy: Retain
+  volumeMode: Filesystem
+  csi:
+    driver: parallelstore.csi.storage.gke.io
+    volumeHandle: ${project}/${ps_location}/${ps_instance_name}/default-pool/default-container
+    volumeAttributes:
+      ip: "${ps_ip_address_1}, ${ps_ip_address_2}, ${ps_ip_address_3}"
+      network: ${ps_network_name}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv_pvc.tf b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv_pvc.tf
new file mode 100644
index 000000000..2c7acb8b0
--- /dev/null
+++ b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv_pvc.tf
@@ -0,0 +1,62 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+resource "local_file" "ps_pv" {
+  content = templatefile("${path.module}/ps_pv.tpl", {
+    pv_name          = "${var.pv_name}"
+    pvc_name         = "${var.pvc_name}"
+    project          = "${var.project}"
+    ps_location      = "${var.location}"
+    ps_instance_name = "${var.ps_instance_name}"
+    ps_ip_address_1  = "${var.ps_ip_address_1}"
+    ps_ip_address_2  = "${var.ps_ip_address_2}"
+    ps_ip_address_3  = "${var.ps_ip_address_3}"
+    ps_network_name  = "${var.ps_network_name}"
+  })
+  filename = "${path.module}/pv-spec-rendered.yaml"
+}
+
+resource "kubectl_manifest" "ps_pv" {
+  count     = var.storageclass == "" ? 1 : 0
+  yaml_body = resource.local_file.ps_pv.content
+}
+
+resource "local_file" "ps_pvc" {
+  content = templatefile("${path.module}/ps_pvc.tpl", {
+    pv_name      = var.storageclass == "" ? "${var.pv_name}" : ""
+    namespace    = "${var.namespace}"
+    pvc_name     = "${var.pvc_name}"
+    storageclass = "${var.storageclass}"
+  })
+  filename = "${path.module}/pvc-spec-rendered.yaml"
+}
+
+resource "kubectl_manifest" "ps_pvc" {
+  yaml_body = resource.local_file.ps_pvc.content
+}
+
+resource "local_file" "dataloader" {
+  content = templatefile("${path.module}/dataloader_job.tpl", {
+    namespace       = "${var.namespace}"
+    pvc_name        = "${var.pvc_name}"
+    gcs_bucket      = "${var.gcs_bucket}"
+    service_account = "${var.k8s_service_account}"
+  })
+  filename = "${path.module}/dataloader-job-rendered.yaml"
+}
+
+resource "kubectl_manifest" "dataloader" {
+  count     = var.run_parallelstore_data_loader == "\"true\"" ? 1 : 0
+  yaml_body = resource.local_file.dataloader.content
+}
diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pvc.tpl b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pvc.tpl
new file mode 100644
index 000000000..93e63feb2
--- /dev/null
+++ b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pvc.tpl
@@ -0,0 +1,13 @@
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: ${pvc_name}
+  namespace: ${namespace}
+spec:
+  accessModes:
+    - ReadWriteMany
+  storageClassName: "${storageclass}"  # quoted so an empty value stays "" (static binding) instead of null (default class)
+  volumeName: ${pv_name}
+  resources:
+    requests:
+      storage: 12000Gi
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf
new file mode 100644
index 000000000..1dcdbe5d4
--- /dev/null
+++ b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf
@@ -0,0 +1,76 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "gcs_bucket" {
+  type        = string
+  description = "GCS Bucket name"
+}
+
+// pv, pvc
+variable "pv_name" {
+  type        = string
+  description = "Name of the PersistentVolume used for DLIO dataset"
+}
+
+variable "pvc_name" {
+  type        = string
+  description = "Name of the PersistentVolumeClaim used for DLIO dataset"
+}
+
+variable "storageclass" {
+  type        = string
+  description = "Name of the storageclass"
+}
+
+variable "project" {
+  type        = string
+  description = "The project name in which the Parallelstore instance is provisioned"
+}
+
+variable "location" {
+  type = string
+}
+
+variable "ps_instance_name" {
+  type = string
+}
+
+variable "ps_ip_address_1" {
+  type = string
+}
+
+variable "ps_ip_address_2" {
+  type = string
+}
+
+variable "ps_ip_address_3" {
+  type = string
+}
+
+variable "ps_network_name" {
+  type = string
+}
+
+variable "k8s_service_account" {
+  type        = string
+  description = "Kubernetes service account name as in the Configure access to Cloud Storage buckets using GKE Workload Identity step"
+}
+
+variable "run_parallelstore_data_loader" {
+  type = string
+}
+
+variable "namespace" {
+  type = string
+}
diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/versions.tf b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/versions.tf
new file mode 100644
index 000000000..7940faa05
--- /dev/null
+++ b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/versions.tf
@@ -0,0 +1,30 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_providers {
+    helm = {
+      source  = "hashicorp/helm"
+      version = "~> 2.8.0"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "2.18.1"
+    }
+    kubectl = {
+      source  = "alekc/kubectl"
+      version = "2.0.1"
+    }
+  }
+}
diff --git a/benchmarks/benchmark/tools/dlio/variables.tf b/benchmarks/benchmark/tools/dlio/variables.tf
index 6b2e7f73c..0a304a3d6 100644
--- a/benchmarks/benchmark/tools/dlio/variables.tf
+++ b/benchmarks/benchmark/tools/dlio/variables.tf
@@ -30,6 +30,20 @@ variable "gcs_bucket" {
   default     = ""
 }
 
+// at most one of the below triggers can be set to true
+variable "gcs_fuse_csi_driver_enabled" {
+  type        = string
+  description = "Set to true if running DLIO on GCSFuse and the Cloud Storage FUSE CSI driver is enabled on your cluster"
+  default     = "\"true\""
+}
+
+variable "paralllestore_csi_driver_enabled" {
+  type        = string
+  description = "Set to true if running DLIO on Parallelstore and the Parallelstore CSI driver is enabled on your cluster"
+  default     = "\"false\""
+}
+// at most one of the above triggers can be set to true
+
 // DLIO Job configurations
 variable "job_backoffLimit" {
   type        = number
@@ -49,12 +63,6 @@ variable "job_parallelism" {
   default     = 1
 }
 
-variable "gcs_fuse_csi_driver_enabled" {
-  type        = string
-  description = "Set to true if the Cloud Storage FUSE CSI driver is enabled on your cluster"
-  default     = "\"true\""
-}
-
 variable "gcs_fuse_sidecar_cpu_limit" {
   type        = string
   description = "The maximum amount of CPU resource that the sidecar container can use"
@@ -73,6 +81,18 @@ variable "gcs_fuse_sidecar_ephemeral_storage_limit" {
   default     = "\"100Gi\""
 }
 
+variable "pscsi_sidecar_cpu_limit" {
+  type        = string
+  description = "The maximum amount of CPU resource that the sidecar container can use"
+  default     = "\"20\""
+}
+
+variable "pscsi_sidecar_memory_limit" {
+  type        = string
+  description = "The maximum amount of Memory resource that the sidecar container can use"
+  default     = "\"20Gi\""
+}
+
 variable "dlio_container_cpu_limit" {
   type        = number
   description = "The maximum amount of CPU resource that the DLIO benchmark workload container can use"
@@ -208,4 +228,60 @@ variable "gcsfuse_type_cache_ttl" {
   type        = string
   description = "Specifies how long Cloud Storage FUSE caches the mapping of objects in Cloud Storage to their corresponding type, such as files or directories"
   default     = "120m0s"
-}
\ No newline at end of file
+}
+
+// parallelstore variables
+variable "run_parallelstore_data_loader" {
+  type        = string
+  description = "Set to true if running the dataloader for parallelstore"
+  default     = "\"true\""
+}
+
+variable "parallelstore_instance_name" {
+  type        = string
+  description = "instance name of parallelstore"
+  default     = ""
+}
+
+// The IPs are listed as "accessPoints" in the result of instance describe command
+variable "parallelstore_ip_address_1" {
+  type        = string
+  description = "ip address of the parallelstore instance's accessPoints"
+  default     = ""
+}
+
+variable "parallelstore_ip_address_2" {
+  type        = string
+  description = "ip address of the parallelstore instance's accessPoints"
+  default     = ""
+}
+
+variable "parallelstore_ip_address_3" {
+  type        = string
+  description = "ip address of the parallelstore instance's accessPoints"
+  default     = ""
+}
+
+variable "parallelstore_network_name" {
+  type        = string
+  description = "network name of the parallelstore instance"
+  default     = ""
+}
+
+variable "parallelstore_location" {
+  type        = string
+  description = "location of the parallelstore instance, e.g. us-central1-a"
+  default     = ""
+}
+
+variable "parallelstore_storageclass" {
+  type        = string
+  description = "the storage class used for dynamic provisioning. if using static provisioning, set it to an empty string"
+  default     = "parallelstore-rwx"
+}
+
+variable "parallelstore_project" {
+  type        = string
+  description = "the project name of the parallelstore instance"
+  default     = ""
+}