Skip to content

Commit

Permalink
add support for PS dlio and PS dataloader (GoogleCloudPlatform#256)
Browse files Browse the repository at this point in the history
  • Loading branch information
leiyiz authored Mar 6, 2024
1 parent f8b6687 commit 8444078
Show file tree
Hide file tree
Showing 12 changed files with 375 additions and 9 deletions.
11 changes: 10 additions & 1 deletion benchmarks/benchmark/tools/dlio/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,21 @@ If you need to reinstall any resources, make sure to delete this file as well.

## Run DLIO Job
1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload
2. Change the dlio image in `dlio/podspec.tpl`
2. Change the dlio image in `modules/dlio/podspec.tpl`
3. Run `terraform init`
4. Run `terraform apply`
5. After you finish your test, run `terraform destroy` to delete the
resources

## Run DLIO Job with Parallelstore
Pre-reqs: for now, you need to manually set up the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`.

1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload; notably, set `gcs_fuse_csi_driver_enabled` to `false` and `paralllestore_csi_driver_enabled` to `true`. If you want to use static provisioning, also fill in the "parallelstore variables" and set `parallelstore_storageclass` to `""`.
2. Change the dlio image in `modules/dlio/podspec.tpl` to the desired version. We have tested the job with dlio v0.5.1.
3. Run `terraform init`
4. Run `terraform apply -target=module.ps_storage`
5. Run `terraform apply` after the dataloader job is completed; a PVC patch failure is OK for dynamic provisioning.

## Check Test Result
The test result reports are located in the `${dlio_benchmark_result}` directory. For example,
if you use a GCS bucket to store the training dataset, the GCS bucket will be mounted at
Expand Down
23 changes: 23 additions & 0 deletions benchmarks/benchmark/tools/dlio/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,26 @@ module "gcs_pv_pvc" {
gcs_bucket = var.gcs_bucket
}

# Parallelstore-backed storage (PV/PVC and optional data-loader job) for the
# DLIO benchmark.
module "ps_storage" {
  # Instantiated only when the flag equals the string "true" INCLUDING its
  # embedded double quotes (note the escaped quotes in the comparison).
  count  = var.paralllestore_csi_driver_enabled == "\"true\"" ? 1 : 0
  source = "./modules/parallelstore_storage"

  # Kubernetes-side settings.
  namespace           = var.namespace
  k8s_service_account = var.k8s_service_account
  pv_name             = var.pv_name
  pvc_name            = var.pvc_name
  storageclass        = var.parallelstore_storageclass

  # Parallelstore instance coordinates.
  project          = var.parallelstore_project
  location         = var.parallelstore_location
  ps_instance_name = var.parallelstore_instance_name
  ps_network_name  = var.parallelstore_network_name
  ps_ip_address_1  = var.parallelstore_ip_address_1
  ps_ip_address_2  = var.parallelstore_ip_address_2
  ps_ip_address_3  = var.parallelstore_ip_address_3

  # Data-loader job inputs.
  gcs_bucket                    = var.gcs_bucket
  run_parallelstore_data_loader = var.run_parallelstore_data_loader
}

module "dlio" {
source = "./modules/dlio"

Expand All @@ -43,6 +63,9 @@ module "dlio" {
gcs_fuse_sidecar_cpu_limit = var.gcs_fuse_sidecar_cpu_limit
gcs_fuse_sidecar_memory_limit = var.gcs_fuse_sidecar_memory_limit
gcs_fuse_sidecar_ephemeral_storage_limit = var.gcs_fuse_sidecar_ephemeral_storage_limit
pscsi_driver_enabled = var.paralllestore_csi_driver_enabled
pscsi_sidecar_cpu_limit = var.pscsi_sidecar_cpu_limit
pscsi_sidecar_memory_limit = var.pscsi_sidecar_memory_limit
dlio_container_cpu_limit = var.dlio_container_cpu_limit
dlio_container_memory_limit = var.dlio_container_memory_limit
dlio_container_ephemeral_storage = var.dlio_container_ephemeral_storage
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ resource "local_file" "podspec" {
gcs_fuse_sidecar_cpu_limit = "${var.gcs_fuse_sidecar_cpu_limit}"
gcs_fuse_sidecar_memory_limit = "${var.gcs_fuse_sidecar_memory_limit}"
gcs_fuse_sidecar_ephemeral_storage_limit = "${var.gcs_fuse_sidecar_ephemeral_storage_limit}"
pscsi_driver_enabled = "${var.pscsi_driver_enabled}"
pscsi_sidecar_cpu_limit = "${var.pscsi_sidecar_cpu_limit}"
pscsi_sidecar_memory_limit = "${var.pscsi_sidecar_memory_limit}"
dlio_container_cpu_limit = "${var.dlio_container_cpu_limit}"
dlio_container_memory_limit = "${var.dlio_container_memory_limit}"
dlio_container_ephemeral_storage = "${var.dlio_container_ephemeral_storage}"
Expand All @@ -48,4 +51,4 @@ resource "local_file" "podspec" {

# Applies the content of the rendered podspec file (local_file.podspec) to the
# cluster as a Kubernetes manifest.
resource "kubectl_manifest" "podspec" {
yaml_body = resource.local_file.podspec.content
}
}
3 changes: 3 additions & 0 deletions benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ spec:
gke-gcsfuse/cpu-limit: ${gcs_fuse_sidecar_cpu_limit}
gke-gcsfuse/memory-limit: ${gcs_fuse_sidecar_memory_limit}
gke-gcsfuse/ephemeral-storage-limit: ${gcs_fuse_sidecar_ephemeral_storage_limit}
gke-parallelstore/volumes: ${pscsi_driver_enabled}
gke-parallelstore/cpu-limit: ${pscsi_sidecar_cpu_limit}
gke-parallelstore/memory-limit: ${pscsi_sidecar_memory_limit}
spec:
containers:
- name: dlio
Expand Down
12 changes: 12 additions & 0 deletions benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,18 @@ variable "gcs_fuse_sidecar_ephemeral_storage_limit" {
description = "The maximum amount of Ephemeral Storage resource that the sidecar container can use"
}

# Parallelstore CSI settings. Each value is rendered verbatim into a
# `gke-parallelstore/*` pod annotation in podspec.tpl.
variable "pscsi_driver_enabled" {
  type        = string
  description = "Value rendered into the `gke-parallelstore/volumes` pod annotation of the DLIO job"
}

variable "pscsi_sidecar_cpu_limit" {
  type        = string
  description = "Value rendered into the `gke-parallelstore/cpu-limit` pod annotation of the DLIO job"
}

variable "pscsi_sidecar_memory_limit" {
  type        = string
  description = "Value rendered into the `gke-parallelstore/memory-limit` pod annotation of the DLIO job"
}

variable "dlio_container_cpu_limit" {
type = number
description = "The maximum amount of CPU resource that the DLIO benchmark workload container can use"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Job that copies the contents of a GCS bucket into the mounted
# PersistentVolumeClaim. Rendered by Terraform's templatefile(); the
# placeholders are namespace, gcs_bucket, service_account and pvc_name.
apiVersion: batch/v1
kind: Job
metadata:
  name: data-loader-job
  namespace: ${namespace}
spec:
  # Do not retry: a failed copy is surfaced as a failed Job.
  backoffLimit: 0
  template:
    metadata:
      name: data-loader-job
      annotations:
        # Annotation values must be strings, so the numeric CPU limit is quoted.
        gke-parallelstore/volumes: "true"
        gke-parallelstore/cpu-limit: "5"
        gke-parallelstore/memory-limit: 5Gi
    spec:
      restartPolicy: Never
      containers:
      - name: data-loader
        image: google/cloud-sdk:latest
        env:
        - name: BUCKET_NAME
          value: ${gcs_bucket}
        command:
        - "/bin/sh"
        - "-c"
        # BUCKET_NAME is expanded by the shell at run time, not by Terraform.
        - gsutil -m cp -R gs://$BUCKET_NAME/* /disk/;
        resources:
          limits:
            cpu: "10"
            memory: 20Gi
          requests:
            cpu: "10"
            memory: 20Gi
        volumeMounts:
        - name: ml-perf-volume
          mountPath: /disk
      serviceAccountName: ${service_account}
      volumes:
      - name: ml-perf-volume
        persistentVolumeClaim:
          claimName: ${pvc_name}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# PersistentVolume that points at an existing Parallelstore instance through
# the Parallelstore CSI driver. Rendered by Terraform's templatefile().
kind: PersistentVolume
apiVersion: v1
metadata:
  name: ${pv_name}
spec:
  storageClassName: ""
  volumeMode: Filesystem
  persistentVolumeReclaimPolicy: Retain
  accessModes:
    - ReadWriteMany
  capacity:
    storage: 12Ti
  csi:
    driver: parallelstore.csi.storage.gke.io
    # Handle layout: project / location / instance / pool / container.
    volumeHandle: ${project}/${ps_location}/${ps_instance_name}/default-pool/default-container
    volumeAttributes:
      network: ${ps_network_name}
      # The three instance IP addresses as a single comma-separated string.
      ip: "${ps_ip_address_1}, ${ps_ip_address_2}, ${ps_ip_address_3}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Renders ps_pv.tpl with the Parallelstore instance settings and writes the
# result to pv-spec-rendered.yaml inside the module directory.
resource "local_file" "ps_pv" {
  # Variables are referenced directly; wrapping a lone reference in "${...}"
  # is a deprecated interpolation-only expression.
  content = templatefile("${path.module}/ps_pv.tpl", {
    pv_name          = var.pv_name
    pvc_name         = var.pvc_name
    project          = var.project
    ps_location      = var.location
    ps_instance_name = var.ps_instance_name
    ps_ip_address_1  = var.ps_ip_address_1
    ps_ip_address_2  = var.ps_ip_address_2
    ps_ip_address_3  = var.ps_ip_address_3
    ps_network_name  = var.ps_network_name
  })
  filename = "${path.module}/pv-spec-rendered.yaml"
}

# Applies the rendered PersistentVolume manifest. It is created only when no
# storage class is given (var.storageclass is the empty string).
resource "kubectl_manifest" "ps_pv" {
  yaml_body = local_file.ps_pv.content
  count     = var.storageclass == "" ? 1 : 0
}

# Renders ps_pvc.tpl and writes the result to pvc-spec-rendered.yaml inside
# the module directory.
resource "local_file" "ps_pvc" {
  content = templatefile("${path.module}/ps_pvc.tpl", {
    # The claim names the PV only when no storage class is given; otherwise
    # the PV name is left empty.
    pv_name      = var.storageclass == "" ? var.pv_name : ""
    namespace    = var.namespace
    pvc_name     = var.pvc_name
    storageclass = var.storageclass
  })
  filename = "${path.module}/pvc-spec-rendered.yaml"
}

# Applies the rendered PersistentVolumeClaim manifest to the cluster.
resource "kubectl_manifest" "ps_pvc" {
  yaml_body = local_file.ps_pvc.content
}

# Renders dataloader_job.tpl and writes the result to
# dataloader-job-rendered.yaml inside the module directory.
resource "local_file" "dataloader" {
  # Variables are referenced directly; wrapping a lone reference in "${...}"
  # is a deprecated interpolation-only expression.
  content = templatefile("${path.module}/dataloader_job.tpl", {
    namespace       = var.namespace
    pvc_name        = var.pvc_name
    gcs_bucket      = var.gcs_bucket
    service_account = var.k8s_service_account
  })
  filename = "${path.module}/dataloader-job-rendered.yaml"
}

# Applies the rendered data-loader Job manifest. It is created only when
# var.run_parallelstore_data_loader equals the string "true" INCLUDING its
# embedded double quotes (note the escaped quotes in the comparison).
resource "kubectl_manifest" "dataloader" {
  count     = var.run_parallelstore_data_loader == "\"true\"" ? 1 : 0
  yaml_body = resource.local_file.dataloader.content

  # The Job is rendered with var.pvc_name as its claim, so create the claim
  # first and destroy it last.
  depends_on = [kubectl_manifest.ps_pvc]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# PersistentVolumeClaim for the Parallelstore-backed dataset volume.
# Rendered by Terraform's templatefile().
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: ${pvc_name}
  namespace: ${namespace}
spec:
  accessModes:
    - ReadWriteMany
  # Quoted on purpose: an empty storage class must render as the empty string
  # "" and not as YAML null. A null class lets the cluster assign its default
  # StorageClass, which would no longer match a PV whose class is "".
  storageClassName: "${storageclass}"
  volumeName: ${pv_name}
  resources:
    requests:
      storage: 12000Gi
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

variable "gcs_bucket" {
  type        = string
  description = "GCS Bucket name"
}

// pv, pvc
variable "pv_name" {
  type        = string
  description = "Name of the PersistentVolume used for DLIO dataset"
}

variable "pvc_name" {
  type        = string
  description = "Name of the PersistentVolumeClaim used for DLIO dataset"
}

variable "storageclass" {
  type        = string
  description = "Name of the storageclass. When empty, the PersistentVolume manifest is applied and the PersistentVolumeClaim references it by name"
}

variable "project" {
  type        = string
  description = "The project name in which the Parallelstore instance is provisioned"
}

variable "location" {
  type        = string
  description = "Location of the Parallelstore instance; used as the location segment of the PersistentVolume volumeHandle"
}

variable "ps_instance_name" {
  type        = string
  description = "Name of the Parallelstore instance; used as the instance segment of the PersistentVolume volumeHandle"
}

variable "ps_ip_address_1" {
  type        = string
  description = "First Parallelstore IP address placed in the PersistentVolume `ip` volume attribute"
}

variable "ps_ip_address_2" {
  type        = string
  description = "Second Parallelstore IP address placed in the PersistentVolume `ip` volume attribute"
}

variable "ps_ip_address_3" {
  type        = string
  description = "Third Parallelstore IP address placed in the PersistentVolume `ip` volume attribute"
}

variable "ps_network_name" {
  type        = string
  description = "Network name placed in the PersistentVolume `network` volume attribute"
}

variable "k8s_service_account" {
  type        = string
  description = "Kubernetes service account name as in the Configure access to Cloud Storage buckets using GKE Workload Identity step"
}

variable "run_parallelstore_data_loader" {
  type        = string
  description = "Whether to apply the data-loader Job. The Job is created only when this value is the string \"true\" including the double quotes"
}

variable "namespace" {
  type        = string
  description = "Kubernetes namespace in which the PersistentVolumeClaim and the data-loader Job are created"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Provider requirements for this module. kubernetes and kubectl are pinned to
# exact versions; helm accepts any 2.8.x release.
terraform {
  required_providers {
    helm = {
      version = "~> 2.8.0"
      source  = "hashicorp/helm"
    }
    kubectl = {
      version = "2.0.1"
      source  = "alekc/kubectl"
    }
    kubernetes = {
      version = "2.18.1"
      source  = "hashicorp/kubernetes"
    }
  }
}
Loading

0 comments on commit 8444078

Please sign in to comment.