write back dlio result to a separate result bucket and add parser scr…

…ipt (GoogleCloudPlatform#313) write back dlio result to a separate result bucket
annapendleton · Mar 26, 2024 · 394eee1 · 394eee1
1 parent 753b871
commit 394eee1
Show file tree

Hide file tree

Showing 8 changed files with 105 additions and 19 deletions.
diff --git a/benchmarks/benchmark/tools/dlio/README.md b/benchmarks/benchmark/tools/dlio/README.md
@@ -14,6 +14,8 @@ Preinstall the following on your computer:
 Note: Terraform keeps state metadata in a local file called `terraform.tfstate`.
 If you need to reinstall any resources, make sure to delete this file as well.
 
+The workload identity and `k8s_service_account` should be set up for the `gcs_bucket` and `result_bucket` correctly ahead of time because DLIO jobs need to read from and write to them respectively.
+
 ## Run DLIO Job
 1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload
 2. Change the dlio image in `modules/dlio/podspec.tpl`
@@ -22,20 +24,22 @@ If you need to reinstall any resources, make sure to delete this file as well.
 5. After you finish your test, run `terraform destroy` to delete the
    resources
 
+*__Important__*: To isolate results from different runs, `${dlio_benchmark_result}` should be unique to each run.
+
 ## Run DLIO Job with Parallelstore
-Pre-reqs: right now you'll need to manually setup the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`.
+Pre-reqs:
+- You'll need to manually set up the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`.
 
-1. update `variables.tf` file with your desired settings to run your machine learning benchmark workload, notably set `gcs_fuse_csi_driver_enabled` to `false` and `paralllestore_csi_driver_enabled` to `true`. If you want to use static provisioning, update the "parallelstore variables" and `parallelstore_storageclass` to `""`.
+1. update `variables.tf` file with your desired settings to run your machine learning benchmark workload, notably set `run_with_gcs_fuse_csi` to `false` and `run_with_parallelstore_csi` to `true`. If you want to use static provisioning, update the "parallelstore variables" and `parallelstore_storageclass` to `""`.
 2. Change the dlio image in `dlio/podspec.tpl` to a desired version. We have tested the job with dlio v0.5.1.
 3. run `terraform init`
 4. run `terraform apply -target=module.ps_storage`
 5. run `terraform apply` after the dataloader job is completed; pvc patch failure is OK for dynamic provisioning.
 
+*__Important__*: To isolate results from different runs, `${dlio_benchmark_result}` should be unique to each run.
+
 ## Check Test Result
-The test result reports are located in the `${dlio_benchmark_result}` directory. For example,
-if you use a GCS bucket to store the training dataset, the GCS bucket will be mounted at
-`${dlio_data_mount_path}`, and you can find the test result reports at `${dlio_data_mount_path}/${dlio_benchmark_result}`
-or in the folder with the same name as `${dlio_benchmark_result}` in your GCS bucket.
+The test result reports are located in the provided GCS bucket `${result_bucket}`, in a directory named `${dlio_benchmark_result}`.
 
 ## Debug Workload
 

diff --git a/benchmarks/benchmark/tools/dlio/main.tf b/benchmarks/benchmark/tools/dlio/main.tf
@@ -20,7 +20,7 @@ provider "kubectl" {
 
 module "gcs_pv_pvc" {
   source = "./modules/storage"
-  count  = var.gcs_fuse_csi_driver_enabled == "\"true\"" ? 1 : 0
+  count  = var.run_with_gcs_fuse_csi == "\"true\"" ? 1 : 0
 
   namespace                   = var.namespace
   pv_name                     = var.pv_name
@@ -33,7 +33,7 @@ module "gcs_pv_pvc" {
 
 module "ps_storage" {
   source = "./modules/parallelstore_storage"
-  count  = var.paralllestore_csi_driver_enabled == "\"true\"" ? 1 : 0
+  count  = var.run_with_parallelstore_csi == "\"true\"" ? 1 : 0
 
   pv_name                       = var.pv_name
   pvc_name                      = var.pvc_name
@@ -59,11 +59,11 @@ module "dlio" {
   job_backoffLimit                         = var.job_backoffLimit
   job_completions                          = var.job_completions
   job_parallelism                          = var.job_parallelism
-  gcs_fuse_csi_driver_enabled              = var.gcs_fuse_csi_driver_enabled
+  gcs_fuse_csi_driver_enabled              = var.run_with_gcs_fuse_csi
   gcs_fuse_sidecar_cpu_limit               = var.gcs_fuse_sidecar_cpu_limit
   gcs_fuse_sidecar_memory_limit            = var.gcs_fuse_sidecar_memory_limit
   gcs_fuse_sidecar_ephemeral_storage_limit = var.gcs_fuse_sidecar_ephemeral_storage_limit
-  pscsi_driver_enabled                     = var.paralllestore_csi_driver_enabled
+  pscsi_driver_enabled                     = var.run_with_parallelstore_csi
   pscsi_sidecar_cpu_limit                  = var.pscsi_sidecar_cpu_limit
   pscsi_sidecar_memory_limit               = var.pscsi_sidecar_memory_limit
   dlio_container_cpu_limit                 = var.dlio_container_cpu_limit
@@ -84,6 +84,7 @@ module "dlio" {
   dlio_iostat_devices                      = var.dlio_iostat_devices
   dlio_read_threads                        = var.dlio_read_threads
   gcs_bucket                               = var.gcs_bucket
+  result_bucket                            = var.result_bucket
   k8s_service_account                      = var.k8s_service_account
   pvc_name                                 = var.pvc_name
 }
diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf b/benchmarks/benchmark/tools/dlio/modules/dlio/job.tf
@@ -43,6 +43,7 @@ resource "local_file" "podspec" {
     dlio_iostat_devices                      = "${var.dlio_iostat_devices}"
     dlio_read_threads                        = "${var.dlio_read_threads}"
     gcs_bucket                               = "${var.gcs_bucket}"
+    result_bucket                            = "${var.result_bucket}"
     service_account                          = "${var.k8s_service_account}"
     pvc_name                                 = "${var.pvc_name}"
   })

diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl b/benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl
@@ -12,7 +12,7 @@ spec:
       labels:
         app: dlio-job
       annotations:
-        gke-gcsfuse/volumes: ${gcs_fuse_csi_driver_enabled}
+        gke-gcsfuse/volumes: "true"
         gke-gcsfuse/cpu-limit: ${gcs_fuse_sidecar_cpu_limit}
         gke-gcsfuse/memory-limit: ${gcs_fuse_sidecar_memory_limit}
         gke-gcsfuse/ephemeral-storage-limit: ${gcs_fuse_sidecar_ephemeral_storage_limit}
@@ -51,8 +51,8 @@ spec:
               python dlio_postprocessor.py --output-folder $OUTPUT_FOLDER;
               rm $OUTPUT_FOLDER/\.*\.pfw;
               echo 'copying results';
-              mkdir -p ${dlio_data_mount_path}/${dlio_benchmark_result}/$MY_POD_NAME;
-              cp -r $OUTPUT_FOLDER ${dlio_data_mount_path}/${dlio_benchmark_result}/$MY_POD_NAME;
+              mkdir -p /dlio_results/${dlio_benchmark_result}/$MY_POD_NAME;
+              cp -r $OUTPUT_FOLDER /dlio_results/${dlio_benchmark_result}/$MY_POD_NAME;
               echo 'done';
             fi
         env:
@@ -67,11 +67,18 @@ spec:
           mountPath: ${dlio_data_mount_path}
         - name: dshm
           mountPath: /dev/shm
+        - name: results
+          mountPath: /dlio_results
       serviceAccountName: ${service_account}
       volumes:
       - name: ml-perf-volume
         persistentVolumeClaim:
           claimName: ${pvc_name}
+      - name: results
+        csi:
+          driver: gcsfuse.csi.storage.gke.io
+          volumeAttributes:
+            bucketName: ${result_bucket}
       - name: dshm
         emptyDir:
           medium: Memory

diff --git a/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf b/benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf
@@ -27,6 +27,11 @@ variable "gcs_bucket" {
   description = "GCS Bucket name"
 }
 
+variable "result_bucket" {
+  type        = string
+  description = "GCS Bucket name"
+}
+
 variable "pvc_name" {
   type        = string
   description = "Name of the PersistentVolumeClaim used for DLIO dataset"

diff --git a/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf b/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/benchmarks/benchmark/tools/dlio/parser.py b/benchmarks/benchmark/tools/dlio/parser.py
@@ -0,0 +1,62 @@
+import os
+import json
+import datetime
+
+# Root directory that is walked recursively for DLIO report files.
+RESULT_FOLDER = './tmp'
+
+# Top-level keys read from each summary.json report.
+START_TIME = 'start'
+END_TIME = 'end'
+# Keys read from the 'metric' section of summary.json. GPU and
+# SAMPLE_THROUGHPUT are indexed as per-epoch sequences; the M_* keys are read
+# as single values.
+GPU = 'train_au_percentage'
+M_GPU = 'train_au_mean_percentage'
+SAMPLE_THROUGHPUT = 'train_throughput_samples_per_second'
+M_SAMPLE_THROUGHPUT = 'train_throughput_mean_samples_per_second'
+M_MB = "train_io_mean_MB_per_second"
+# Key read from every epoch entry of per_epoch_stats.json.
+DURATION = 'duration'
+
+
+def average(numbers):
+  return sum(numbers) / len(numbers)
+
+def process_summary(summary):
+  metric = summary['metric']
+  gpu = metric[M_GPU]
+  spp = metric[M_SAMPLE_THROUGHPUT]
+  mmb = metric[M_MB]
+  fe_gpu_percentage = metric[GPU][0]
+  fe_samples_per_second = metric[SAMPLE_THROUGHPUT][0]
+  sub_gpu_percentage = average(metric[GPU][1:]) if len(metric[GPU]) > 1 else -1
+  sub_spp = average(metric[SAMPLE_THROUGHPUT][1:])  if len(metric[SAMPLE_THROUGHPUT]) > 1 else -1
+  start_time = summary[START_TIME]
+  end_time = summary[END_TIME]
+  total_time = datetime.datetime.strptime(end_time, "%Y-%m-%dT%H:%M:%S.%f") - datetime.datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%S.%f")
+  return total_time.total_seconds(), fe_gpu_percentage, fe_samples_per_second, sub_gpu_percentage, sub_spp, gpu, spp, mmb
+
+# Labels for the tuple returned by process_summary, in the same order.
+headers = ['e2e training seconds', 'first epoch au percentage', 'first epoch throughput samples per second', 'subsequent epochs average au percentage', 'subsequent epochs throughput samples per second',
+           'mean au percentage', 'mean throughput samples per second', 'mean MB per second']
+
+def process_per_epoch_stats(epochs):
+  fe_duration = float(epochs['1'][DURATION])
+  sq_durations = []
+  for i in range(2, len(epochs)):
+    sq_durations.append(float(epochs[str(i)][DURATION]))
+  sq_avg_duration = average(sq_durations) if len(sq_durations) > 0 else -1
+  return fe_duration, sq_avg_duration
+
+# Labels for the tuple returned by process_per_epoch_stats, in the same order.
+per_epoch_headers = ['first epoch duration seconds', "subsequent epochs average duration seconds"]
+
+summary_results = []
+per_epoch_results = []
+for root, dirs, files in os.walk(RESULT_FOLDER):
+  for file in files:
+    if file == 'summary.json':
+      with open(root +'/'+ file) as f:
+        d = json.load(f)
+        summary_results.append(process_summary(d))
+    if file == 'per_epoch_stats.json':
+      with open(root +'/'+ file) as f:
+        d = json.load(f)
+        per_epoch_results.append(process_per_epoch_stats(d))
+
+
+print(list(zip(headers, list(map(average, zip(*summary_results))))))
+print(list(zip(per_epoch_headers, list(map(average, zip(*per_epoch_results))))))
diff --git a/benchmarks/benchmark/tools/dlio/variables.tf b/benchmarks/benchmark/tools/dlio/variables.tf
@@ -30,14 +30,20 @@ variable "gcs_bucket" {
   default     = "<your gcs bucket>"
 }
 
+variable "result_bucket" {
+  type        = string
+  description = "GCS Bucket name to store dlio results"
+  default     = "<result bucket>"
+}
+
 // at most one of the below triggers can be set to true
-variable "gcs_fuse_csi_driver_enabled" {
+variable "run_with_gcs_fuse_csi" {
   type        = string
-  description = "Set to true if running DLIO on GCSFuse and the Cloud Storage FUSE CSI driver is enabled on your cluster"
+  description = "Set to true if running DLIO on GCSFuse"
   default     = "\"true\""
 }
 
-variable "paralllestore_csi_driver_enabled" {
+variable "run_with_parallelstore_csi" {
   type        = string
   description = "Set to true if running DLIO on Parallelstore and the Parallelstore CSI driver is enabled on your cluster"
   default     = "\"false\""
@@ -119,8 +125,8 @@ variable "dlio_data_mount_path" {
 
 variable "dlio_benchmark_result" {
   type        = string
-  description = "The path stores benchmark result reports"
-  default     = "results"
+  description = "The path stores benchmark result reports for a specific DLIO run. When doing multi-pod runs, this folder stores results logged from all the pods, needs to be changed every run to guarantee result isolation."
+  default     = "<a result folder name unique to your run>"
 }
 
 // DLIO configurations, detailed explanation check