From c2af0c20ac630c9e0d6ac097523f5e5901f855a0 Mon Sep 17 00:00:00 2001
From: hariharandev1
Date: Thu, 29 Aug 2024 23:37:56 -0700
Subject: [PATCH 1/5] Improve CI performance.

1. Improve generation of HDF5 files; it was taking too long.
2. Do not finalize MPI in pytest; this allows multiple tests to run together.

---
 .github/workflows/ci.yml                 | 85 ++-----------------
 .../data_generator/hdf5_generator.py     | 41 ++++-----
 tests/dlio_benchmark_test.py             |  3 +-
 3 files changed, 27 insertions(+), 102 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8fe5ce04..e7e45df7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -81,91 +81,32 @@ jobs:
       - name: test_gen_data
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data -v --durations=0
           rm -rf data
       - name: test_custom_storage_root_gen_data
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data -v --durations=0
           rm -rf data
       - name: test_train
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train -v --durations=0
           rm -rf data
       - name: test_custom_storage_root_train
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train -v --durations=0
           rm -rf data
       - name: test_checkpoint_epoch
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v
+          mpirun -np 2 pytest -k test_checkpoint_epoch -v --durations=0
           rm -rf data
       - name: test_checkpoint_step
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_checkpoint_step -v
+          mpirun -np 2 pytest -k test_checkpoint_step -v --durations=0
       - name: test_eval
         run: |
           source ${VENV_PATH}/bin/activate
@@ -173,26 +114,18 @@
       - name: test_multi_threads
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v
-          mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v
-          mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v
-          mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v
-          mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v
-          mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v
+          mpirun -np 2 pytest -k test_multi_threads -v --durations=0
           rm -rf data
       - name: test-pytorch-multiprocessing-context
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v
+          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context -v --durations=0
           rm -rf data
       - name: test_subset
         run: |
           source ${VENV_PATH}/bin/activate
           rm -rf output data checkpoints
-          mpirun -np 2 pytest -k test_subset -v
+          mpirun -np 2 pytest -k test_subset -v --durations=0
           rm -rf data
       - name: test-tf-loader-tfrecord
         run: |
diff --git a/dlio_benchmark/data_generator/hdf5_generator.py b/dlio_benchmark/data_generator/hdf5_generator.py
index 02fe1e68..01d0704c 100644
--- a/dlio_benchmark/data_generator/hdf5_generator.py
+++ b/dlio_benchmark/data_generator/hdf5_generator.py
@@ -45,38 +45,29 @@ def generate(self):
         """
         super().generate()
         np.random.seed(10)
-        samples_per_iter=max(1, int(self._args.generation_buffer_size/self._args.record_length))
         record_labels = [0] * self.num_samples
         dim = self.get_dimension(self.total_files_to_generate)
+        chunks = None
+        if self.enable_chunking:
+            chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
+            if chunk_dimension > self._dimension:
+                chunk_dimension = self._dimension
+            chunks = (1, chunk_dimension, chunk_dimension)
+        compression = None
+        compression_level = None
+        if self.compression != Compression.NONE:
+            compression = str(self.compression)
+            if self.compression == Compression.GZIP:
+                compression_level = self.compression_level
         for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)):
-            progress(i, self.total_files_to_generate, "Generating HDF5 Data")
             dim1 = dim[2*i]
             dim2 = dim[2*i+1]
-            records = np.random.randint(255, size=(samples_per_iter, dim1, dim2), dtype=np.uint8)
+            records = np.random.randint(255, size=(self.num_samples, dim1, dim2), dtype=np.uint8)
             out_path_spec = self.storage.get_uri(self._file_list[i])
+            progress(i+1, self.total_files_to_generate, "Generating HDF5 Data")
             hf = h5py.File(out_path_spec, 'w')
-            chunks = None
-            if self.enable_chunking:
-                chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
-                if chunk_dimension > self._dimension:
-                    chunk_dimension = self._dimension
-                chunks = (1, chunk_dimension, chunk_dimension)
-            compression = None
-            compression_level = None
-            if self.compression != Compression.NONE:
-                compression = str(self.compression)
-                if self.compression == Compression.GZIP:
-                    compression_level = self.compression_level
-            dset = hf.create_dataset('records', (self.num_samples, dim1, dim2), chunks=chunks, compression=compression,
-                                     compression_opts=compression_level, dtype=np.uint8)
-            samples_written = 0
-            while samples_written < self.num_samples:
-                if samples_per_iter < self.num_samples-samples_written:
-                    samples_to_write = samples_per_iter
-                else:
-                    samples_to_write = self.num_samples-samples_written
-                dset[samples_written:samples_written+samples_to_write] = records[:samples_to_write]
-                samples_written += samples_to_write
+            hf.create_dataset('records', (self.num_samples, dim1, dim2), chunks=chunks, compression=compression,
+                              compression_opts=compression_level, dtype=np.uint8, data=records)
             hf.create_dataset('labels', data=record_labels)
             hf.close()
         np.random.seed()
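The hdf5_generator.py change above is the heart of the speedup: instead of creating an empty dataset and filling it slice by slice from a bounded buffer, each file's samples are generated as one array and handed to h5py in a single create_dataset(..., data=...) call. A minimal standalone sketch of that pattern follows; the file name, shape, and sample count are illustrative, not DLIO's actual configuration:

    import h5py
    import numpy as np

    num_samples, dim1, dim2 = 16, 128, 128
    # Generate every sample for this file up front.
    records = np.random.randint(255, size=(num_samples, dim1, dim2), dtype=np.uint8)

    with h5py.File("example.h5", "w") as hf:
        # Passing data= writes the whole dataset in one call, so no
        # per-slice write loop (and no generation buffer) is needed.
        hf.create_dataset("records", (num_samples, dim1, dim2), dtype=np.uint8,
                          data=records)
        hf.create_dataset("labels", data=[0] * num_samples)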
diff --git a/tests/dlio_benchmark_test.py b/tests/dlio_benchmark_test.py
index 7446919c..6c3b476d 100644
--- a/tests/dlio_benchmark_test.py
+++ b/tests/dlio_benchmark_test.py
@@ -48,7 +48,8 @@ def init():
     DLIOMPI.get_instance().initialize()

 def finalize():
-    DLIOMPI.get_instance().finalize()
+    # DLIOMPI.get_instance().finalize()
+    pass

 def clean(storage_root="./") -> None:
     comm.Barrier()
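The tests/dlio_benchmark_test.py change above turns finalize() into a no-op because MPI may be initialized and finalized only once per process, and no MPI call is legal after MPI_Finalize; finalizing inside one test would break every later test collected into the same mpirun-launched pytest process. A sketch of the intended one-shot lifecycle, assuming mpi4py's default behavior (initialize on import, register MPI_Finalize via atexit):

    from mpi4py import MPI  # MPI_Init runs once, on first import

    def finalize():
        # Deliberately a no-op: MPI_Finalize may run only once per process,
        # so it is left to the atexit hook mpi4py registers at import time.
        pass

    comm = MPI.COMM_WORLD
    print(f"rank {comm.Get_rank()} of {comm.Get_size()}")
    # Interpreter exit triggers MPI_Finalize exactly once, after all tests.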
From 7ccc57d433457ad4da5ffca4b2daeea03b99a1e7 Mon Sep 17 00:00:00 2001
From: hariharandev1
Date: Thu, 29 Aug 2024 23:41:39 -0700
Subject: [PATCH 2/5] Reduce debug level.

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e7e45df7..0352b63e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,9 +21,9 @@ jobs:
       CXX: g++-${{ matrix.gcc }}
       DFTRACER_BUILD_TYPE: "Debug"
       DFTRACER_ENABLE: 1
-      DFTRACER_LOG_LEVEL: "DEBUG"
+      DFTRACER_LOG_LEVEL: "INFO"
       DLIO_EXEC: ${{ matrix.venv == 'via-setup' && 'dlio_benchmark' || 'python dlio_benchmark/main.py' }}
-      GOTCHA_DEBUG: 3
+      GOTCHA_DEBUG: 1
       OMPI_ALLOW_RUN_AS_ROOT: 1
       OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
       PYTHON_VER: ${{ matrix.python }}
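Lowering DFTRACER_LOG_LEVEL from DEBUG to INFO and GOTCHA_DEBUG from 3 to 1 trims per-call logging overhead in CI. As an illustration of the pattern such variables typically follow (this is not dftracer's or GOTCHA's internal code), a level read from the environment gates how much work the logger does:

    import logging
    import os

    # Illustrative mapping of an env var onto Python log levels.
    name = os.environ.get("DFTRACER_LOG_LEVEL", "INFO")
    logging.basicConfig(level=getattr(logging, name, logging.INFO))
    logging.debug("per-call detail, skipped at INFO")  # cheap when disabled
    logging.info("high-level progress, still emitted")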
From ee2ecbd43f2e0bfafbd86c325070064a09eb73a1 Mon Sep 17 00:00:00 2001
From: hariharandev1
Date: Thu, 29 Aug 2024 23:54:51 -0700
Subject: [PATCH 3/5] Revert consolidated pytest invocations.

---
 .github/workflows/ci.yml | 91 ++++++++++++++++++++++++++++++++++------
 1 file changed, 79 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0352b63e..5a19dd01 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -81,32 +81,91 @@ jobs:
       - name: test_gen_data
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_gen_data -v --durations=0
+          mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v
           rm -rf data
       - name: test_custom_storage_root_gen_data
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_storage_root_gen_data -v --durations=0
+          mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v
           rm -rf data
       - name: test_train
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_train -v --durations=0
+          mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v
+          mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v
+          mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v
+          mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v
+          mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v
+          mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v
+          mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v
+          mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v
+          mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v
+          mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v
+          mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v
+          mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v
+          mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v
+          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v
+          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v
+          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v
           rm -rf data
       - name: test_custom_storage_root_train
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_custom_storage_root_train -v --durations=0
+          mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v
           rm -rf data
       - name: test_checkpoint_epoch
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_checkpoint_epoch -v --durations=0
+          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v
+          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v
+          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v
+          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v
+          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v
+          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v
           rm -rf data
       - name: test_checkpoint_step
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_checkpoint_step -v --durations=0
+          mpirun -np 2 pytest -k test_checkpoint_step -v
       - name: test_eval
         run: |
           source ${VENV_PATH}/bin/activate
@@ -114,18 +173,26 @@
       - name: test_multi_threads
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_multi_threads -v --durations=0
+          mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v
+          mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v
+          mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v
+          mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v
+          mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v
+          mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v
           rm -rf data
       - name: test-pytorch-multiprocessing-context
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context -v --durations=0
+          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v
+          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v
+          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v
+          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v
           rm -rf data
       - name: test_subset
         run: |
           source ${VENV_PATH}/bin/activate
           rm -rf output data checkpoints
-          mpirun -np 2 pytest -k test_subset -v --durations=0
+          mpirun -np 2 pytest -k test_subset -v
           rm -rf data
       - name: test-tf-loader-tfrecord
         run: |
@@ -160,9 +227,9 @@
         run: |
           source ${VENV_PATH}/bin/activate
           rm -rf output data checkpoints
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.format=synthetic
           rm -rf data
       - name: test_cosmoflow
         run: |

From 4e6019e776f19faa9a95e50c57c8e7e58c489ab3 Mon Sep 17 00:00:00 2001
From: hariharandev1
Date: Fri, 30 Aug 2024 00:15:33 -0700
Subject: [PATCH 4/5] Reduce read threads; there is not enough data.

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5a19dd01..b90f39c2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -227,9 +227,9 @@
         run: |
           source ${VENV_PATH}/bin/activate
           rm -rf output data checkpoints
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.format=synthetic
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.reader.read_threads=1
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.reader.read_threads=1
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.reader.read_threads=1 ++workload.dataset.format=synthetic
           rm -rf data
       - name: test_cosmoflow
         run: |
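The bracketed IDs restored in PATCH 3, such as test_train[hdf5-tensorflow-dali], are the test IDs pytest derives from parametrization, and -k selects tests by matching them. A short sketch with illustrative parameters (DLIO's real matrix is much larger):

    import pytest

    @pytest.mark.parametrize("fmt, framework",
                             [("png", "tensorflow"), ("hdf5", "pytorch")])
    def test_train(fmt, framework):
        # pytest names these cases test_train[png-tensorflow] and
        # test_train[hdf5-pytorch]; `pytest -k "test_train[png-tensorflow]"`
        # then selects exactly one case on every MPI rank.
        assert fmt and framework

Running each case as its own mpirun invocation keeps the MPI processes short-lived, at the cost of re-importing the frameworks for every case; the dropped --durations=0 flag had only printed a per-test timing report.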
From 8fefd7d95b8b4425d64fd3df65afeada07f54747 Mon Sep 17 00:00:00 2001
From: hariharandev1
Date: Fri, 30 Aug 2024 00:16:18 -0700
Subject: [PATCH 5/5] Increase file count to support some I/O.

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b90f39c2..924e1d86 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -227,9 +227,9 @@
         run: |
           source ${VENV_PATH}/bin/activate
           rm -rf output data checkpoints
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.reader.read_threads=1
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.reader.read_threads=1
-          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.reader.read_threads=1 ++workload.dataset.format=synthetic
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.reader.read_threads=1
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.reader.read_threads=1
+          mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.reader.read_threads=1 ++workload.dataset.format=synthetic
           rm -rf data
       - name: test_cosmoflow
         run: |
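The ++workload.* arguments above are Hydra override syntax: a ++key=value override sets the key whether or not it already exists in the composed config, which is how these runs raise num_files_train without editing the workload YAML. A sketch using Hydra's compose API; the config_path and config_name here are assumptions for illustration, not necessarily DLIO's layout:

    from hydra import compose, initialize

    with initialize(version_base=None, config_path="conf"):
        cfg = compose(config_name="config",
                      overrides=["++workload.dataset.num_files_train=16",
                                 "++workload.reader.read_threads=1"])
        print(cfg.workload.dataset.num_files_train)  # 16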