diff --git a/benchmarks/benchmark/tools/dlio/README.md b/benchmarks/benchmark/tools/dlio/README.md
index 99dd87b75..8c87aab2b 100644
--- a/benchmarks/benchmark/tools/dlio/README.md
+++ b/benchmarks/benchmark/tools/dlio/README.md
@@ -14,6 +14,8 @@ Preinstall the following on your computer:
 Note: Terraform keeps state metadata in a local file called `terraform.tfstate`.
 If you need to reinstall any resources, make sure to delete this file as well.
 
+The workload identity and `k8s_service_account` should be set up correctly for the `gcs_bucket` and `result_bucket` ahead of time, because DLIO jobs need to read from and write to them respectively.
+
 ## Run DLIO Job
 1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload
 2. Change the dlio image in `modules/dlio/podspec.tpl`
@@ -22,20 +24,22 @@ If you need to reinstall any resources, make sure to delete this file as well.
 5. After you finish your test, run `terraform destroy` to delete the resources
 
+*__Important__*: To isolate results from different runs, `${dlio_benchmark_result}` should be unique to each run.
+
 ## Run DLIO Job with Parallelstore
-Pre-reqs: right now you'll need to manually setup the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`.
+Pre-reqs:
+- You'll need to manually set up the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`.
 
-1. update `variables.tf` file with your desired settings to run your machine learning benchmark workload, notably set `gcs_fuse_csi_driver_enabled` to `false` and `paralllestore_csi_driver_enabled` to `true`. If you want to use static provisioning, update the "parallelstore variables" and `parallelstore_storageclass` to `""`.
+1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload; notably, set `run_with_gcs_fuse_csi` to `false` and `run_with_parallelstore_csi` to `true`. If you want to use static provisioning, update the "parallelstore variables" and set `parallelstore_storageclass` to `""`.
 2. Change the dlio image in `dlio/podspec.tpl` to a desired version. We have tested the job with dlio v0.5.1.
 3. run `terraform init`
 4. run `terraform apply -target=module.ps_storage`
 5. run `terraform apply` after the dataloader job is completed; pvc patch failure is OK for dynamic provisioning.
 
+*__Important__*: To isolate results from different runs, `${dlio_benchmark_result}` should be unique to each run.
+
 ## Check Test Result
-The test result reports are located in the `${dlio_benchmark_result}` directory. For example,
-if you use a GCS bucket to store the training dataset, the GCS bucket will be mounted at
-`${dlio_data_mount_path}`, and you can find the test result reports at `${dlio_data_mount_path}/${dlio_benchmark_result}`
-or in the folder with the same name as `${dlio_benchmark_result}` in your GCS bucket.
+The test result reports are located in the provided GCS bucket `${result_bucket}`, under a directory named `${dlio_benchmark_result}`.
 
 ## Debug Workload
diff --git a/benchmarks/benchmark/tools/dlio/main.tf b/benchmarks/benchmark/tools/dlio/main.tf
index 20e8027b9..b7d6eb327 100644
--- a/benchmarks/benchmark/tools/dlio/main.tf
+++ b/benchmarks/benchmark/tools/dlio/main.tf
@@ -20,7 +20,7 @@ provider "kubectl" {
 module "gcs_pv_pvc" {
   source = "./modules/storage"
 
-  count = var.gcs_fuse_csi_driver_enabled == "\"true\"" ? 1 : 0
+  count = var.run_with_gcs_fuse_csi == "\"true\"" ? 1 : 0
 
   namespace = var.namespace
   pv_name   = var.pv_name
@@ -33,7 +33,7 @@ module "gcs_pv_pvc" {
 module "ps_storage" {
   source = "./modules/parallelstore_storage"
 
-  count = var.paralllestore_csi_driver_enabled == "\"true\"" ? 1 : 0
+  count = var.run_with_parallelstore_csi == "\"true\"" ? 1 : 0
 
   pv_name  = var.pv_name
   pvc_name = var.pvc_name
@@ -59,11 +59,11 @@ module "dlio" {
   job_backoffLimit = var.job_backoffLimit
   job_completions  = var.job_completions
   job_parallelism  = var.job_parallelism
-  gcs_fuse_csi_driver_enabled              = var.gcs_fuse_csi_driver_enabled
+  gcs_fuse_csi_driver_enabled              = var.run_with_gcs_fuse_csi
   gcs_fuse_sidecar_cpu_limit               = var.gcs_fuse_sidecar_cpu_limit
   gcs_fuse_sidecar_memory_limit            = var.gcs_fuse_sidecar_memory_limit
   gcs_fuse_sidecar_ephemeral_storage_limit = var.gcs_fuse_sidecar_ephemeral_storage_limit
-  pscsi_driver_enabled                     = var.paralllestore_csi_driver_enabled
+  pscsi_driver_enabled                     = var.run_with_parallelstore_csi
   pscsi_sidecar_cpu_limit                  = var.pscsi_sidecar_cpu_limit
   pscsi_sidecar_memory_limit               = var.pscsi_sidecar_memory_limit
   dlio_container_cpu_limit                 = var.dlio_container_cpu_limit
@@ -84,6 +84,7 @@ module "dlio" {
   dlio_iostat_devices = var.dlio_iostat_devices
   dlio_read_threads   = var.dlio_read_threads
   gcs_bucket          = var.gcs_bucket
+  result_bucket       = var.result_bucket
   k8s_service_account = var.k8s_service_account
   pvc_name            = var.pvc_name
 }
diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf b/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
index be3019eed..f1d0ce9e8 100644
--- a/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
+++ b/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
@@ -43,6 +43,7 @@ resource "local_file" "podspec" {
     dlio_iostat_devices = "${var.dlio_iostat_devices}"
     dlio_read_threads   = "${var.dlio_read_threads}"
     gcs_bucket          = "${var.gcs_bucket}"
+    result_bucket       = "${var.result_bucket}"
     service_account     = "${var.k8s_service_account}"
     pvc_name            = "${var.pvc_name}"
   })
diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl b/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
index dd58b988a..62ff1f6d8 100644
--- a/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
+++ b/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
@@ -12,7 +12,7 @@ spec:
       labels:
         app: dlio-job
       annotations:
-        gke-gcsfuse/volumes: ${gcs_fuse_csi_driver_enabled}
+        gke-gcsfuse/volumes: "true"
         gke-gcsfuse/cpu-limit: ${gcs_fuse_sidecar_cpu_limit}
         gke-gcsfuse/memory-limit: ${gcs_fuse_sidecar_memory_limit}
         gke-gcsfuse/ephemeral-storage-limit: ${gcs_fuse_sidecar_ephemeral_storage_limit}
@@ -51,8 +51,8 @@ spec:
            python dlio_postprocessor.py --output-folder $OUTPUT_FOLDER;
            rm $OUTPUT_FOLDER/\.*\.pfw;
            echo 'copying results';
-           mkdir -p ${dlio_data_mount_path}/${dlio_benchmark_result}/$MY_POD_NAME;
-           cp -r $OUTPUT_FOLDER ${dlio_data_mount_path}/${dlio_benchmark_result}/$MY_POD_NAME;
+           mkdir -p /dlio_results/${dlio_benchmark_result}/$MY_POD_NAME;
+           cp -r $OUTPUT_FOLDER /dlio_results/${dlio_benchmark_result}/$MY_POD_NAME;
            echo 'done';
            fi
         env:
@@ -67,11 +67,18 @@ spec:
           mountPath: ${dlio_data_mount_path}
         - name: dshm
           mountPath: /dev/shm
+        - name: results
+          mountPath: /dlio_results
       serviceAccountName: ${service_account}
       volumes:
       - name: ml-perf-volume
        persistentVolumeClaim:
          claimName: ${pvc_name}
+      - name: results
+        csi:
+          driver: gcsfuse.csi.storage.gke.io
+          volumeAttributes:
+            bucketName: ${result_bucket}
       - name: dshm
        emptyDir:
          medium: Memory
diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf b/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
index e5f4c5979..7cb4e6303 100644
--- a/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
+++ b/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
@@ -27,6 +27,11 @@ variable "gcs_bucket" {
   description = "GCS Bucket name"
 }
 
+variable "result_bucket" {
+  type        = string
+  description = "GCS Bucket name"
+}
+
 variable "pvc_name" {
   type        = string
   description = "Name of the PersistentVolumeClaim used for DLIO dataset"
diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf
index 1dcdbe5d4..190a48fa2 100644
--- a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf
+++ b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/benchmarks/benchmark/tools/dlio/parser.py b/benchmarks/benchmark/tools/dlio/parser.py
new file mode 100644
index 000000000..b103355b8
--- /dev/null
+++ b/benchmarks/benchmark/tools/dlio/parser.py
@@ -0,0 +1,62 @@
+import os
+import json
+import datetime
+
+RESULT_FOLDER = './tmp'
+
+START_TIME = 'start'
+END_TIME = 'end'
+GPU = 'train_au_percentage'
+M_GPU = 'train_au_mean_percentage'
+SAMPLE_THROUGHPUT = 'train_throughput_samples_per_second'
+M_SAMPLE_THROUGHPUT = 'train_throughput_mean_samples_per_second'
+M_MB = "train_io_mean_MB_per_second"
+DURATION = 'duration'
+
+
+def average(numbers):
+    return sum(numbers) / len(numbers)
+
+def process_summary(summary):
+    metric = summary['metric']
+    gpu = metric[M_GPU]
+    spp = metric[M_SAMPLE_THROUGHPUT]
+    mmb = metric[M_MB]
+    fe_gpu_percentage = metric[GPU][0]
+    fe_samples_per_second = metric[SAMPLE_THROUGHPUT][0]
+    sub_gpu_percentage = average(metric[GPU][1:]) if len(metric[GPU]) > 1 else -1
+    sub_spp = average(metric[SAMPLE_THROUGHPUT][1:]) if len(metric[SAMPLE_THROUGHPUT]) > 1 else -1
+    start_time = summary[START_TIME]
+    end_time = summary[END_TIME]
+    total_time = datetime.datetime.strptime(end_time, "%Y-%m-%dT%H:%M:%S.%f") - datetime.datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%S.%f")
+    return total_time.total_seconds(), fe_gpu_percentage, fe_samples_per_second, sub_gpu_percentage, sub_spp, gpu, spp, mmb
+
+headers = ['e2e training seconds', 'first epoch au percentage', 'first epoch throughput samples per second', 'subsequent epochs average au percentage', 'subsequent epochs throughput samples per second',
+           'mean au percentage', 'mean throughput samples per second', 'mean MB per second']
+
+def process_per_epoch_stats(epochs):
+    fe_duration = float(epochs['1'][DURATION])
+    sq_durations = []
+    for i in range(2, len(epochs)):
+        sq_durations.append(float(epochs[str(i)][DURATION]))
+    sq_avg_duration = average(sq_durations) if len(sq_durations) > 0 else -1
+    return fe_duration, sq_avg_duration
+
+per_epoch_headers = ['first epoch duration seconds', "subsequent epochs average duration seconds"]
+
+summary_results = []
+per_epoch_results = []
+for root, dirs, files in os.walk(RESULT_FOLDER):
+    for file in files:
+        if file == 'summary.json':
+            with open(root +'/'+ file) as f:
+                d = json.load(f)
+                summary_results.append(process_summary(d))
+        if file == 'per_epoch_stats.json':
+            with open(root +'/'+ file) as f:
+                d = json.load(f)
+                per_epoch_results.append(process_per_epoch_stats(d))
+
+
+print(list(zip(headers, list(map(average, zip(*summary_results))))))
+print(list(zip(per_epoch_headers, list(map(average, zip(*per_epoch_results))))))
diff --git a/benchmarks/benchmark/tools/dlio/variables.tf b/benchmarks/benchmark/tools/dlio/variables.tf
index 0a304a3d6..681e36f89 100644
--- a/benchmarks/benchmark/tools/dlio/variables.tf
+++ b/benchmarks/benchmark/tools/dlio/variables.tf
@@ -30,14 +30,20 @@ variable "gcs_bucket" {
   default     = ""
 }
 
+variable "result_bucket" {
+  type        = string
+  description = "GCS Bucket name to store dlio results"
+  default     = ""
+}
+
 // at most one of the below trigers can be set to true
-variable "gcs_fuse_csi_driver_enabled" {
+variable "run_with_gcs_fuse_csi" {
   type        = string
-  description = "Set to true if running DLIO on GCSFuse and the Cloud Storage FUSE CSI driver is enabled on your cluster"
+  description = "Set to true if running DLIO on GCSFuse"
   default     = "\"true\""
 }
 
-variable "paralllestore_csi_driver_enabled" {
+variable "run_with_parallelstore_csi" {
   type        = string
   description = "Set to true if running DLIO on Parallelstore and the Parallelstore CSI driver is enabled on your cluster"
   default     = "\"false\""
@@ -119,8 +125,8 @@ variable "dlio_data_mount_path" {
 
 variable "dlio_benchmark_result" {
   type        = string
-  description = "The path stores benchmark result reports"
-  default     = "results"
+  description = "The path that stores benchmark result reports for a specific DLIO run. When doing multi-pod runs, this folder stores results logged from all the pods; it needs to be changed every run to guarantee result isolation."
+  default     = ""
 }
 
 // DLIO configurations, detailed explanation check
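
Note on the new `parser.py`: it walks a local `./tmp` directory (`RESULT_FOLDER`) for the `summary.json` and `per_epoch_stats.json` reports that each pod copies into the result bucket, then prints the metrics averaged across pods. A minimal usage sketch, assuming `gsutil` is installed and authenticated; the bucket and run names below are placeholders for your `result_bucket` and `dlio_benchmark_result` values:

```sh
# Download one run's reports into the folder parser.py scans (./tmp).
mkdir -p ./tmp
gsutil -m cp -r gs://my-result-bucket/my-unique-run-id ./tmp

# Aggregate summary.json and per_epoch_stats.json across all pods and print the averages.
python3 parser.py
```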