Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to support native Data loader and readers #81

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 59 additions & 51 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,57 +57,65 @@ jobs:
- name: test_gen_data
run: |
source ${VENV}/bin/activate
mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[dlio_png-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[dlio_npz-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[dlio_jpeg-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[dlio_tfrecord-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[dlio_hdf5-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_gen_data[dali_npz-pytorch-native_dali] -v

- name: test_custom_storage_root_gen_data
run: |
source ${VENV}/bin/activate
mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[dlio_png-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[dlio_npz-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[dlio_jpeg-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[dlio_tfrecord-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_storage_root_gen_data[dlio_hdf5-tensorflow-dlio_tensorflow] -v

- name: test_train
run: |
source ${VENV}/bin/activate
mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v
mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v
mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v
mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v
mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v
mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v
mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v
mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v
mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v
mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v
mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v
mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v
mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v
mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v
mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v
mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v
mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v
mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v
mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v
mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v
mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v
mpirun -np 2 pytest -k test_train[dlio_png-tensorflow-dlio_tensorflow0] -v
mpirun -np 2 pytest -k test_train[dlio_npz-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_train[dlio_jpeg-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_train[dlio_tfrecord-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_train[dlio_hdf5-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_train[dlio_csv-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_train[dlio_png-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_train[dlio_npz-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_train[dlio_jpeg-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_train[dlio_hdf5-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_train[dlio_csv-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_train[dlio_png-tensorflow-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_npz-tensorflow-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_jpeg-tensorflow-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_hdf5-tensorflow-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_csv-tensorflow-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_png-pytorch-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_npz-pytorch-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_jpeg-pytorch-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_hdf5-pytorch-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_csv-pytorch-dlio_dali] -v
mpirun -np 2 pytest -k test_train[dlio_png-tensorflow-dlio_tensorflow1] -v
mpirun -np 2 pytest -k test_train[tf_tfrecord-tensorflow-native_tensorflow] -v
mpirun -np 2 pytest -k test_train[dali_tfrecord-pytorch-native_dali] -v
mpirun -np 2 pytest -k test_train[dali_npz-pytorch-native_dali] -v

- name: test_custom_storage_root_train
run: |
source ${VENV}/bin/activate
mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_png-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_npz-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_jpeg-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_tfrecord-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_hdf5-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_csv-tensorflow-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_png-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_npz-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_jpeg-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_hdf5-pytorch-dlio_pytorch] -v
mpirun -np 2 pytest -k test_custom_storage_root_train[dlio_csv-pytorch-dlio_pytorch] -v
- name: test_checkpoint_epoch
run: |
source ${VENV}/bin/activate
Expand All @@ -123,12 +131,12 @@ jobs:
- name: test_multi_threads
run: |
source ${VENV}/bin/activate
mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v
mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v
mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v
mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v
mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v
mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v
mpirun -np 2 pytest -k test_multi_threads[tensorflow-0-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_multi_threads[tensorflow-1-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_multi_threads[tensorflow-2-dlio_tensorflow] -v
mpirun -np 2 pytest -k test_multi_threads[pytorch-0-dlio_pytorch] -v
mpirun -np 2 pytest -k test_multi_threads[pytorch-1-dlio_pytorch] -v
mpirun -np 2 pytest -k test_multi_threads[pytorch-2-dlio_pytorch] -v
- name: test-tf-loader-tfrecord
run: |
source ${VENV}/bin/activate
Expand All @@ -142,10 +150,10 @@ jobs:
- name: test-tf-loader-npz
run: |
source ${VENV}/bin/activate
mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=dlio_tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=dlio_tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
- name: test_subset
run: |
source ${VENV}/bin/activate
mpirun -np 2 dlio_benchmark ++workload.workflow.generate_data=True ++workload.workflow.train=False
mpirun -np 2 dlio_benchmark ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=8
mpirun -np 2 dlio_benchmark ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=8
73 changes: 49 additions & 24 deletions dlio_benchmark/common/enumerations.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,41 +89,66 @@ class FormatType(Enum):
"""
Format Type supported by the benchmark.
"""
TFRECORD = 'tfrecord'
HDF5 = 'hdf5'
CSV = 'csv'
NPZ = 'npz'
HDF5_OPT = 'hdf5_opt'
JPEG = 'jpeg'
PNG = 'png'
DLIO_TFRECORD = 'dlio_tfrecord'
DLIO_HDF5 = 'dlio_hdf5'
DLIO_CSV = 'dlio_csv'
DLIO_NPZ = 'dlio_npz'
DLIO_HDF5_OPT = 'dlio_hdf5_opt'
DLIO_JPEG = 'dlio_jpeg'
DLIO_PNG = 'dlio_png'
TF_TFRECORD = 'tf_tfrecord'
DALI_TFRECORD = 'dali_tfrecord'
DALI_NPZ = 'dali_npz'

def __str__(self):
return self.value

@staticmethod
def getextension(value):
if value in [FormatType.DLIO_TFRECORD.value,FormatType.DALI_TFRECORD.value] :
return "tfrecord"
elif FormatType.DLIO_HDF5.value == value:
return "hdf5"
elif FormatType.DLIO_CSV.value == value:
return "csv"
elif value in [FormatType.DLIO_NPZ.value] :
return "npz"
elif value == FormatType.DALI_NPZ.value:
return "npy"
elif FormatType.DLIO_HDF5_OPT.value == value:
return "hdf5"
elif FormatType.DLIO_JPEG.value == value:
return "jpeg"
elif FormatType.DLIO_PNG.value == value:
return "png"

@ staticmethod
def get_enum(value):
if FormatType.TFRECORD.value == value:
return FormatType.TFRECORD
elif FormatType.HDF5.value == value:
return FormatType.HDF5
elif FormatType.CSV.value == value:
return FormatType.CSV
elif FormatType.NPZ.value == value:
return FormatType.NPZ
elif FormatType.HDF5_OPT.value == value:
return FormatType.HDF5_OPT
elif FormatType.JPEG.value == value:
return FormatType.JPEG
elif FormatType.PNG.value == value:
return FormatType.PNG
if FormatType.DLIO_TFRECORD.value == value:
return FormatType.DLIO_TFRECORD
elif FormatType.DLIO_HDF5.value == value:
return FormatType.DLIO_HDF5
elif FormatType.DLIO_CSV.value == value:
return FormatType.DLIO_CSV
elif FormatType.DLIO_NPZ.value == value:
return FormatType.DLIO_NPZ
elif FormatType.DLIO_HDF5_OPT.value == value:
return FormatType.DLIO_HDF5_OPT
elif FormatType.DLIO_JPEG.value == value:
return FormatType.DLIO_JPEG
elif FormatType.DLIO_PNG.value == value:
return FormatType.DLIO_PNG

class DataLoaderType(Enum):
"""
Framework DataLoader Type
"""
TENSORFLOW='tensorflow'
PYTORCH='pytorch'
DALI='dali'
DLIO_TENSORFLOW='dlio_tensorflow'
DLIO_PYTORCH='dlio_pytorch'
DLIO_DALI='dlio_dali'
NATIVE_TENSORFLOW = 'native_tensorflow'
NATIVE_PYTORCH = 'native_pytorch'
NATIVE_DALI = 'native_dali'
CUSTOM='custom'
NONE='none'

Expand Down
4 changes: 2 additions & 2 deletions dlio_benchmark/configs/workload/bert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ workflow:

dataset:
data_folder: data/bert
format: tfrecord
format: dlio_tfrecord
num_files_train: 500
num_samples_per_file: 313532
record_length: 2500
Expand All @@ -22,7 +22,7 @@ train:
total_training_steps: 1000

reader:
data_loader: tensorflow
data_loader: dlio_tensorflow
read_threads: 1
computation_threads: 1
transfer_size: 262144
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/configs/workload/cosmoflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dataset:


reader:
data_loader: tensorflow
data_loader: dlio_tensorflow
computation_threads: 8
read_threads: 8
batch_size: 1
Expand Down
4 changes: 2 additions & 2 deletions dlio_benchmark/configs/workload/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ workflow:

dataset:
data_folder: data/default
format: npz
format: dlio_npz
num_files_train: 64
num_files_eval: 8
num_samples_per_file: 1
Expand All @@ -19,7 +19,7 @@ dataset:
num_subfolders_eval: 2

reader:
data_loader: pytorch
data_loader: dlio_pytorch
batch_size: 4
batch_size_eval: 1

Expand Down
4 changes: 2 additions & 2 deletions dlio_benchmark/configs/workload/resnet50.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ dataset:
num_samples_per_file: 1
record_length: 150528
data_folder: data/resnet50
format: jpeg
format: dlio_jpeg

train:
computation_time: 0.1

reader:
data_loader: pytorch
data_loader: dlio_pytorch
read_threads: 8
computation_threads: 8
4 changes: 2 additions & 2 deletions dlio_benchmark/configs/workload/unet3d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ workflow:

dataset:
data_folder: data/unet3d/
format: npz
format: dlio_npz
num_files_train: 168
num_samples_per_file: 1
record_length: 146600628
record_length_stdev: 68341808
record_length_resize: 2097152

reader:
data_loader: pytorch
data_loader: dlio_pytorch
batch_size: 4
read_threads: 4
file_shuffle: seed
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(self):
self._file_list = None
self.num_subfolders_train = self._args.num_subfolders_train
self.num_subfolders_eval = self._args.num_subfolders_eval
self.format = self._args.format
self.format = self._args.format_ext
self.storage = StorageFactory().get_storage(self._args.storage_type, self._args.storage_root,
self._args.framework)
def get_dimension(self):
Expand Down
15 changes: 9 additions & 6 deletions dlio_benchmark/data_generator/generator_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,25 @@ def __init__(self):

@staticmethod
def get_generator(type):
if type == FormatType.TFRECORD:
if type in [FormatType.DLIO_TFRECORD, FormatType.DALI_TFRECORD, FormatType.TF_TFRECORD]:
from dlio_benchmark.data_generator.tf_generator import TFRecordGenerator
return TFRecordGenerator()
elif type == FormatType.HDF5:
elif type == FormatType.DLIO_HDF5:
from dlio_benchmark.data_generator.hdf5_generator import HDF5Generator
return HDF5Generator()
elif type == FormatType.CSV:
elif type == FormatType.DLIO_CSV:
from dlio_benchmark.data_generator.csv_generator import CSVGenerator
return CSVGenerator()
elif type == FormatType.NPZ:
elif type == FormatType.DLIO_NPZ:
from dlio_benchmark.data_generator.npz_generator import NPZGenerator
return NPZGenerator()
elif type == FormatType.JPEG:
elif type == FormatType.DALI_NPZ:
from dlio_benchmark.data_generator.npy_generator import NPYGenerator
return NPYGenerator()
elif type == FormatType.DLIO_JPEG:
from dlio_benchmark.data_generator.jpeg_generator import JPEGGenerator
return JPEGGenerator()
elif type == FormatType.PNG:
elif type == FormatType.DLIO_PNG:
from dlio_benchmark.data_generator.png_generator import PNGGenerator
return PNGGenerator()
else:
Expand Down
Loading