From 770e3d0a1acb7aeec09e17ced6386919adb0710a Mon Sep 17 00:00:00 2001 From: Fiona-MCW <70996026+fiona-gladwin@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:56:41 +0530 Subject: [PATCH] Audio PR - rocAL Python changes for Audio Decoder (#120) * Audio Decoder Python Changes --------- Co-authored-by: root Co-authored-by: Swetha B S Co-authored-by: swetha097 <59434434+swetha097@users.noreply.github.com> Co-authored-by: swetha097 Co-authored-by: Swetha B S <> Co-authored-by: SundarRajan28 --- CHANGELOG.md | 3 + rocAL/include/loaders/circular_buffer.h | 6 +- rocAL/source/loaders/audio/audio_loader.cpp | 10 +- .../loaders/audio/audio_read_and_decode.cpp | 6 +- rocAL_pybind/amd/rocal/decoders.py | 25 +++ rocAL_pybind/amd/rocal/plugin/pytorch.py | 70 ++++++- rocAL_pybind/rocal_pybind.cpp | 4 + tests/python_api/README.md | 21 +++ tests/python_api/audio_unit_test.py | 176 ++++++++++++++++++ tests/python_api/parse_config.py | 10 + 10 files changed, 316 insertions(+), 15 deletions(-) create mode 100644 tests/python_api/audio_unit_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b0ec173df4..8cbe49a377 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,9 @@ * Packages - dev & tests * Support for audio loader and decoder, which uses libsndfile library to decode wav files * C++ rocAL audio unit test and python script to run and compare the outputs +* Python support for audio decoders +* Pytorch iterator for Audio +* Python audio unit test, and support to verify outputs ### Optimizations diff --git a/rocAL/include/loaders/circular_buffer.h b/rocAL/include/loaders/circular_buffer.h index df30f68ee8..d39601ea98 100644 --- a/rocAL/include/loaders/circular_buffer.h +++ b/rocAL/include/loaders/circular_buffer.h @@ -37,9 +37,9 @@ struct DecodedDataInfo { std::vector _roi_height; std::vector _original_width; std::vector _original_height; - std::vector _original_audio_samples; //! 
Amplitude of an audio signal at a specific point in time - std::vector _original_audio_channels; //! Number of audio channels in an audio signal - std::vector _original_audio_sample_rates; //! The number of samples of audio carried per second + std::vector _audio_samples; //! Number of samples in each decoded audio of the batch (used as the tensor ROI extent) + std::vector _audio_channels; //! Number of audio channels in an audio signal + std::vector _audio_sample_rates; //! The number of samples of audio carried per second }; struct CropImageInfo { diff --git a/rocAL/source/loaders/audio/audio_loader.cpp b/rocAL/source/loaders/audio/audio_loader.cpp index 0a169b6f5d..64a01247ac 100644 --- a/rocAL/source/loaders/audio/audio_loader.cpp +++ b/rocAL/source/loaders/audio/audio_loader.cpp @@ -135,9 +135,9 @@ void AudioLoader::initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, _max_decoded_samples = _output_tensor->info().max_shape().at(0); _max_decoded_channels = _output_tensor->info().max_shape().at(1); _decoded_audio_info._data_names.resize(_batch_size); - _decoded_audio_info._original_audio_samples.resize(_batch_size); - _decoded_audio_info._original_audio_channels.resize(_batch_size); - _decoded_audio_info._original_audio_sample_rates.resize(_batch_size); + _decoded_audio_info._audio_samples.resize(_batch_size); + _decoded_audio_info._audio_channels.resize(_batch_size); + _decoded_audio_info._audio_sample_rates.resize(_batch_size); _circ_buff.init(_mem_type, _output_mem_size, _prefetch_queue_depth); _is_initialized = true; LOG("Loader module initialized"); @@ -229,8 +229,8 @@ AudioLoader::update_output_audio() { return LoaderModuleStatus::OK; _output_decoded_audio_info = _circ_buff.get_decoded_data_info(); _output_names = _output_decoded_audio_info._data_names; - _output_tensor->update_tensor_roi(_output_decoded_audio_info._original_audio_samples, _output_decoded_audio_info._original_audio_channels); - 
_output_tensor->update_audio_tensor_sample_rate(_output_decoded_audio_info._original_audio_sample_rates); + _output_tensor->update_tensor_roi(_output_decoded_audio_info._audio_samples, _output_decoded_audio_info._audio_channels); + _output_tensor->update_audio_tensor_sample_rate(_output_decoded_audio_info._audio_sample_rates); _circ_buff.pop(); if (!_loop) _remaining_audio_count -= _batch_size; diff --git a/rocAL/source/loaders/audio/audio_read_and_decode.cpp b/rocAL/source/loaders/audio/audio_read_and_decode.cpp index 820dea7bec..2fbff4ac6c 100644 --- a/rocAL/source/loaders/audio/audio_read_and_decode.cpp +++ b/rocAL/source/loaders/audio/audio_read_and_decode.cpp @@ -126,9 +126,9 @@ AudioReadAndDecode::Load(float *audio_buffer, } for (size_t i = 0; i < _batch_size; i++) { audio_info._data_names[i] = _audio_meta_info[i].file_name; - audio_info._original_audio_samples[i] = _audio_meta_info[i].samples; - audio_info._original_audio_channels[i] = _audio_meta_info[i].channels; - audio_info._original_audio_sample_rates[i] = _audio_meta_info[i].sample_rate; + audio_info._audio_samples[i] = _audio_meta_info[i].samples; + audio_info._audio_channels[i] = _audio_meta_info[i].channels; + audio_info._audio_sample_rates[i] = _audio_meta_info[i].sample_rate; } } _decode_time.end(); // Debug timing diff --git a/rocAL_pybind/amd/rocal/decoders.py b/rocAL_pybind/amd/rocal/decoders.py index 2c47414534..83d27c0352 100644 --- a/rocAL_pybind/amd/rocal/decoders.py +++ b/rocAL_pybind/amd/rocal/decoders.py @@ -402,3 +402,28 @@ def image_slice(*inputs, file_root='', path='', annotations_file='', shard_id=0, image_decoder_slice = b.fusedDecoderCropShard( Pipeline._current_pipeline._handle, *(kwargs_pybind.values())) return (image_decoder_slice) + +def audio(*inputs, file_root='', file_list_path='', shard_id=0, num_shards=1, random_shuffle=False, downmix=False, stick_to_shard=False, shard_size=-1): + """!Decodes wav audio files. + + @param inputs List of input audio. 
+ @param file_root Folder Path to the audio data. + @param file_list_path (for future use) Path to the text file containing list of files and the labels + @param shard_id Shard ID for parallel processing. + @param num_shards Total number of shards for parallel processing. + @param random_shuffle Whether to shuffle audio samples randomly. + @param downmix Converts the audio data to single channel when enabled + @param stick_to_shard (for future use; currently ignored) The reader sticks to the data for its corresponding shard when enabled + @param shard_size (for future use; currently ignored) Provides the number of files in an epoch of a particular shard. + @return Decoded audio. + """ + kwargs_pybind = { + "source_path": file_root, + "shard_id": shard_id, + "num_shards": num_shards, + "is_output": False, + "shuffle": random_shuffle, + "loop": False, + "downmix": downmix} + decoded_audio = b.audioDecoderSingleShard(Pipeline._current_pipeline._handle, *(kwargs_pybind.values())) + return decoded_audio diff --git a/rocAL_pybind/amd/rocal/plugin/pytorch.py b/rocAL_pybind/amd/rocal/plugin/pytorch.py index 82a029b7cc..b35181d4f8 100644 --- a/rocAL_pybind/amd/rocal/plugin/pytorch.py +++ b/rocAL_pybind/amd/rocal/plugin/pytorch.py @@ -221,7 +221,7 @@ def __del__(self): class ROCALClassificationIterator(ROCALGenericIterator): - """!ROCAL iterator for classification tasks for PyTorch. It returns 2 outputs + """!rocAL iterator for classification tasks for PyTorch. It returns 2 outputs (data and label) in the form of PyTorch's Tensors. Calling @@ -237,15 +237,15 @@ class ROCALClassificationIterator(ROCALGenericIterator): ROCALGenericIterator(pipelines, ["data", "label"], size) Please keep in mind that Tensors returned by the iterator are - still owned by ROCAL. They are valid till the next iterator call. + still owned by rocAL. They are valid till the next iterator call. If the content needs to be preserved please copy it to another tensor. 
pipelines (list of amd.rocal.pipeline.Pipeline) List of pipelines to use size (int) Number of samples in the epoch (Usually the size of the dataset). auto_reset (bool, optional, default = False) Whether the iterator resets itself for the next epoch or it requires reset() to be called separately. fill_last_batch (bool, optional, default = True) Whether to fill the last batch with data up to 'self.batch_size'. The iterator would return the first integer multiple of self._num_gpus * self.batch_size entries which exceeds 'size'. Setting this flag to False will cause the iterator to return exactly 'size' entries. - dynamic_shape (bool, optional, default = False) Whether the shape of the output of the ROCAL pipeline can change during execution. If True, the pytorch tensor will be resized accordingly if the shape of ROCAL returned tensors changes during execution. If False, the iterator will fail in case of change. - last_batch_padded (bool, optional, default = False) Whether the last batch provided by ROCAL is padded with the last sample or it just wraps up. In the conjunction with fill_last_batch it tells if the iterator returning last batch with data only partially filled with data from the current epoch is dropping padding samples or samples from the next epoch. If set to False next epoch will end sooner as data from it was consumed but dropped. If set to True next epoch would be the same length as the first one. + dynamic_shape (bool, optional, default = False) Whether the shape of the output of the rocAL pipeline can change during execution. If True, the pytorch tensor will be resized accordingly if the shape of rocAL returned tensors changes during execution. If False, the iterator will fail in case of change. + last_batch_padded (bool, optional, default = False) Whether the last batch provided by rocAL is padded with the last sample or it just wraps up. 
In the conjunction with fill_last_batch it tells if the iterator returning last batch with data only partially filled with data from the current epoch is dropping padding samples or samples from the next epoch. If set to False next epoch will end sooner as data from it was consumed but dropped. If set to True next epoch would be the same length as the first one. Example ------- @@ -270,6 +270,68 @@ def __init__(self, super(ROCALClassificationIterator, self).__init__(pipe, tensor_layout=pipe._tensor_layout, tensor_dtype=pipe._tensor_dtype, multiplier=pipe._multiplier, offset=pipe._offset, display=display, device=device, device_id=device_id) +class ROCALAudioIterator(object): + """! rocAL iterator for audio tasks for PyTorch + The Tensors that are returned by the iterator will be owned by rocAL and would be valid until next iteration. + @param pipeline The rocAL pipeline to use for processing data. + @param tensor_dtype Data type of the output tensors (currently unused; the dtype reported by each output tensor is used instead). + @param size Number of samples in the epoch (Usually the size of the dataset). Currently unused; the iterator length is read from the pipeline. + @param auto_reset Whether the iterator resets itself for the next epoch or it requires reset() to be called separately. Currently not honored by this iterator; reset() is never called automatically.
+ @param device The device to use for processing - CPU / GPU + @param device_id The ID of the device to use + """ + def __init__(self, pipeline, tensor_dtype = types.FLOAT, size = -1, auto_reset = False, device = "cpu", device_id = 0): + self.loader = pipeline + self.device = device + self.device_id = device_id + self.output = None + self.iterator_length = b.getRemainingImages(self.loader._handle) # To change the name of getRemainingImages to getRemainingSamples in upcoming PRs + self.max_shape = None + self.batch_size = self.loader._batch_size + self.output_list = None + self.labels_size = self.batch_size + self.output_memory_type = self.loader._output_memory_type + + def next(self): + return self.__next__() + + def __next__(self): + if self.loader.rocal_run() != 0: + raise StopIteration + else: + self.output_tensor_list = self.loader.get_output_tensors() + # Output list used to store pipeline outputs - can support multiple augmentation outputs + self.output_list = [] + for i in range(len(self.output_tensor_list)): + dimensions = self.output_tensor_list[i].dimensions() + self.num_roi_dims = self.output_tensor_list[i].roi_dims_size() + self.roi_array = np.zeros(self.batch_size * self.num_roi_dims * 2, dtype=np.int32) + self.output_tensor_list[i].copy_roi(self.roi_array) + torch_dtype = self.output_tensor_list[i].dtype() + if self.device == "cpu": + output = torch.empty(dimensions, dtype=getattr(torch, torch_dtype)) + self.labels_tensor = torch.empty(self.labels_size, dtype=getattr(torch, torch_dtype)) + else: + torch_gpu_device = torch.device('cuda', self.device_id) + output = torch.empty(dimensions, dtype=getattr(torch, torch_dtype), device=torch_gpu_device) + self.labels_tensor = torch.empty(self.labels_size, dtype=getattr(torch, torch_dtype), device=torch_gpu_device) + + self.output_tensor_list[i].copy_data(ctypes.c_void_p(output.data_ptr()), self.output_memory_type) + self.output_list.append(output) + + return self.output_list, self.labels_tensor, 
torch.tensor(self.roi_array.reshape(self.batch_size,4)[...,2:4]) + + def reset(self): + b.rocalResetLoaders(self.loader._handle) + + def __iter__(self): + return self + + def __len__(self): + return self.iterator_length + + def __del__(self): + b.rocalRelease(self.loader._handle) def draw_patches(img, idx, bboxes): """!Writes images to disk as a PNG file. diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp index 9b4d323470..3e9c4b38bc 100644 --- a/rocAL_pybind/rocal_pybind.cpp +++ b/rocAL_pybind/rocal_pybind.cpp @@ -638,6 +638,10 @@ PYBIND11_MODULE(rocal_pybind, m) { py::return_value_policy::reference); m.def("externalSourceFeedInput", &wrapperRocalExternalSourceFeedInput, py::return_value_policy::reference); + m.def("audioDecoderSingleShard", &rocalAudioFileSourceSingleShard, "Reads file from the source given and decodes it", + py::return_value_policy::reference); + m.def("audioDecoder", &rocalAudioFileSource, "Reads file from the source given and decodes it", + py::return_value_policy::reference); m.def("rocalResetLoaders", &rocalResetLoaders); m.def("videoMetaDataReader", &rocalCreateVideoLabelReader, py::return_value_policy::reference); // rocal_api_augmentation.h diff --git a/tests/python_api/README.md b/tests/python_api/README.md index 83506809a1..44cc28d58b 100644 --- a/tests/python_api/README.md +++ b/tests/python_api/README.md @@ -119,3 +119,24 @@ This test runs a pipeline making use of the external source reader in 3 differen ```bash python3 external_source_reader.py ``` + +## Audio Unit Test + +To run the Audio unit test with all test cases. 
Follow the steps below: + +```bash +export ROCAL_DATA_PATH=<absolute_path_to_directory_containing_rocal_data> +``` +To run the audio unit test and verify the correctness of the outputs + +```bash +python3 audio_unit_test.py +``` +To pass the audio data path, batch size, and run a particular test case use the following command + +```bash +python3 audio_unit_test.py --audio_path=<path_to_wav_files> --test_case <test_case_number> --batch-size <batch_size> +``` + +**Available Test Cases** +* Case 0 - Audio Decoder \ No newline at end of file diff --git a/tests/python_api/audio_unit_test.py b/tests/python_api/audio_unit_test.py new file mode 100644 index 0000000000..745dc4278b --- /dev/null +++ b/tests/python_api/audio_unit_test.py @@ -0,0 +1,176 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +from amd.rocal.pipeline import pipeline_def +from amd.rocal.plugin.pytorch import ROCALAudioIterator +import amd.rocal.fn as fn +import random +import os +import sys +import matplotlib.pyplot as plt +import torch +import numpy as np +import timeit +from parse_config import parse_args + +np.set_printoptions(threshold=1000, edgeitems=10000) +seed = random.SystemRandom().randint(0, 2**32 - 1) + +test_case_augmentation_map = { + 0: "audio_decoder", +} + +def plot_audio_wav(audio_tensor, idx): + # audio is expected as a tensor + audio_data = audio_tensor.detach().numpy() + audio_data = audio_data.flatten() + plt.plot(audio_data) + plt.savefig("output_folder/audio_reader/" + str(idx) + ".png") + plt.close() + +def verify_output(audio_tensor, rocal_data_path, roi_tensor, test_results, case_name): + ref_path = f'{rocal_data_path}/rocal_data/GoldenOutputsTensor/reference_outputs_audio/{case_name}_output.bin' + data_array = np.fromfile(ref_path, dtype=np.float32) + audio_data = audio_tensor.detach().numpy().flatten() + roi_data = roi_tensor.detach().numpy() + matched_indices = 0 + for j in range(roi_data[0]): + ref_val = data_array[j] + out_val = audio_data[j] + # ensuring that out_val is not exactly zero while ref_val is non-zero. + invalid_comparison = (out_val == 0.0) and (ref_val != 0.0) + #comparing the absolute difference between the output value (out_val) and the reference value (ref_val) with a tolerance threshold of 1e-20. 
+ if not invalid_comparison and np.abs(out_val - ref_val) < 1e-20: + matched_indices += 1 + + # Print results + print(f"Results for {case_name}:") + if matched_indices == roi_data[0] and matched_indices != 0: + print("PASSED!") + test_results[case_name] = "PASSED" + else: + print("FAILED!") + test_results[case_name] = "FAILED" + +@pipeline_def(seed=seed) +def audio_decoder_pipeline(path): + audio, labels = fn.readers.file(file_root=path) + return fn.decoders.audio( + audio, + file_root=path, + downmix=False, + shard_id=0, + num_shards=1, + stick_to_shard=False) + +def main(): + args = parse_args() + + audio_path = args.audio_path + rocal_cpu = False if args.rocal_gpu else True + batch_size = args.batch_size + test_case = args.test_case + qa_mode = args.qa_mode + num_threads = 1 + device_id = 0 + rocal_data_path = os.environ.get("ROCAL_DATA_PATH") + + case_list = list(test_case_augmentation_map.keys()) + + if test_case is not None: + if test_case not in case_list: + print(" Invalid Test Case! ") + exit() + else: + case_list = [test_case] + + if args.display: + try: + path = "output_folder/audio_reader" + isExist = os.path.exists(path) + if not isExist: + os.makedirs(path) + except OSError as error: + print(error) + + if rocal_data_path is None: + print("Need to export ROCAL_DATA_PATH") + sys.exit() + if not rocal_cpu: + print("The GPU support for Audio is not given yet. Running on CPU") + rocal_cpu = True + if audio_path == "": + audio_path = f'{rocal_data_path}/rocal_data/audio/wav/' + else: + print("QA mode is disabled for custom audio data") + qa_mode = 0 + if qa_mode and batch_size != 1: + print("QA mode is enabled. 
Batch size is set to 1.") + batch_size = 1 + + print("*********************************************************************") + test_results = {} + for case in case_list: + case_name = test_case_augmentation_map.get(case) + if case_name == "audio_decoder": + audio_pipeline = audio_decoder_pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, rocal_cpu=rocal_cpu, path=audio_path) + audio_pipeline.build() + audio_loader = ROCALAudioIterator(audio_pipeline, auto_reset=True) + cnt = 0 + start = timeit.default_timer() + # Enumerate over the Dataloader + for e in range(int(args.num_epochs)): + print("Epoch :: ", e) + torch.set_printoptions(threshold=5000, profile="full", edgeitems=100) + for i, it in enumerate(audio_loader): + for x in range(len(it[0])): + for audio_tensor, label, roi in zip(it[0][x], it[1], it[2]): + if args.print_tensor: + print("label", label) + print("Audio", audio_tensor) + print("Roi", roi) + if args.display: + plot_audio_wav(audio_tensor, cnt) + cnt+=1 + if qa_mode : + verify_output(audio_tensor, rocal_data_path, roi, test_results, case_name) + print("EPOCH DONE", e) + + stop = timeit.default_timer() + print('\nTime: ', stop - start) + + if qa_mode: + passed_cases = [] + failed_cases = [] + + for augmentation_name, result in test_results.items(): + if result == "PASSED": + passed_cases.append(augmentation_name) + else: + failed_cases.append(augmentation_name) + + print("Number of PASSED tests:", len(passed_cases)) + print(passed_cases) + print("Number of FAILED tests:", len(failed_cases)) + print(failed_cases) + + +if __name__ == "__main__": + main() diff --git a/tests/python_api/parse_config.py b/tests/python_api/parse_config.py index ea919369e2..9ff1692e27 100644 --- a/tests/python_api/parse_config.py +++ b/tests/python_api/parse_config.py @@ -105,6 +105,16 @@ def parse_args(): help='interpolation type used for resize and crop') python_unit_test.add_argument('--scaling-mode', '-sm', type=int, default=0, help='scaling mode 
type used for resize') + + # audio_unit_test.py related options + audio_unit_test = parser.add_argument_group( + 'audio-python-unittest', 'audio-python-unittest-related options') + audio_unit_test.add_argument('--audio_path', type=str, default="", + help='audio files path') + audio_unit_test.add_argument('--test_case', type=int, default=0, + help='test case') + audio_unit_test.add_argument('--qa_mode', type=int, default=1, + help='enable qa mode to compare audio output with ref outputs') # coco_reader.py related options coco_reader = parser.add_argument_group( 'coco-pipeline', 'coco-pipeline-related options')