From c3d590b1464eeb44309f3bda8071dccf381145c4 Mon Sep 17 00:00:00 2001
From: RSKothari
Date: Wed, 7 Apr 2021 10:55:10 +0400
Subject: [PATCH] Improved README and added instructions in benchmark file

---
 .vscode/settings.json |  3 +++
 README.md             | 56 ++++++++++++++++++++++++++++++++++++-------
 args_maker.py         | 10 ++++----
 benchmark.py          | 19 +++++++++++----
 converter.py          | 40 ++++++++++++++++++++++++++-----
 my_functions.py       |  6 +++++
 6 files changed, 109 insertions(+), 25 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..fafb871
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "C:\\ProgramData\\Anaconda3\\python.exe"
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 95d6522..ef354bd 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,19 @@
+
 # Data2H5
 
 This tool rapidly converts loose files scattered within any folder into a consolidated H5 file. This allows for faster read operations with lower memory requirement. **Reasoning**: H5 files consolidate data in contiguous memory sectors.
+
 To learn more about how H5 files work, I refer the reader to this [fantastic article](https://www.oreilly.com/library/view/python-and-hdf5/9781491944981/ch04.html).
 
+Features:
+* H5 files speed up training
+* Point and go - just give the paths!
+* Utilizes all your cores to read data into H5
+* Easy steps to incorporate H5 file into data loader
+* Allows reading custom data formats by providing your own file reading function
+* Allows complex file pruning by supplying your own extension matching routine
+
 ## Requirements
 
 `conda install -c anaconda h5py`
 
@@ -11,6 +21,7 @@ To learn more about how H5 files work, I refer the reader to this [fantastic art
 `conda install -c conda-forge imageio`
 
 ## Commands
+
 Suppose your loose files are spread across a folder. You can use this utility as such:
 
 `python converter --path_images= --path_output= --ext=jpg`
@@ -21,7 +32,7 @@ For example:
 
 ## Operation
 
-This script finds all files with the user specified extension within a folder. It uses the `os.walk` utility to find and read all valid data files. These data files can be extracted using their **relative path** from the folder they were extracted.
+This script uses the `os.walk` utility to find all files (images or otherwise) within a folder that match the user-specified extension. These files are then consolidated into a single H5 file. Each file can then be read directly from the H5 file using its **relative path**.
 
 For example:
 ```
@@ -37,24 +48,51 @@ h5_obj = h5py.File(path_h5, mode='r')
 data = h5_obj['foo/boo/goo/image_0001.jpg'][:]
 ```
 
-Note that `[:]` in the end reads `data` by value. The command preceding `[:]` is a reference to `data`.
-
 ## Advantages
-
-Reading speed increases, lower memory consumption
+* Easy to manage
+* H5 files improve the speed of read operations
+* Lowers memory consumption by leveraging lossless compression
+* Loads data on demand instead of holding the entire dataset in RAM - convenient for large datasets
+* Utilizes caching to further improve read speeds when the same samples are requested repeatedly
 
 ## Custom data types
 
-You can specify a custom data format by specifying `--ext=fancy_ext`. For example:
+You can specify a custom file extension with `--ext=fancy_ext`. For example:
 
 `python converter --path_images= --path_output= --ext=json --custom_read_func`
 
-You may then add your own custom reading logic in `my_functions.py` in the function `my_read`. To ensure the program reads your custom logic function, please be sure to add a flag `--custom_read_func` which indicates the same.
+You may then add your own custom reading logic to the function `my_read` in `my_functions.py`. To make the program use your custom read function, please add the flag `--custom_read_func`, which tells the script to ignore the default reader.
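+
+As a reference, a minimal `my_read` for JSON files could look like the sketch below. It assumes `path_sample` points to a JSON file that can be opened directly and that each file stores a flat list of numbers under a hypothetical `values` key; adapt both assumptions to your data.
+
+```
+import json
+
+import numpy as np
+
+
+def my_read(path_sample):
+    # Parse the JSON file and convert it into an array that h5py can store
+    with open(path_sample, 'r') as fid:
+        content = json.load(fid)
+    datum = np.asarray(content['values'])
+    # Keep the (relative path, datum) contract used by the default reader
+    return path_sample, datum
+```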
 
-## Data loader tricks
+## Custom extension pruning!
 
-Coming soon!
+You can provide your own file extension matching function in `my_prune`, which receives the template string passed via the `--ext` flag. For example, if you want to match complex file extensions such as `.FoO0345` against a template extension string `foo`, you can supply the following code as your custom prune function. Remember to run the converter with the `--custom_prune_func` flag so that your function is used.
+
+```
+def my_prune(filename_str, ext_str):
+    # Logic to verify if the extension type is present
+    # within the filename
+    return True if ext_str in filename_str.lower() else False
+```
+
+## Data loader setup
+
+To use the H5 file in your training data loader, please refer to `benchmark.py`. There are three easy steps to follow (a combined sketch is shown after the steps):
+* Step 1. Generate a list of all files used during training in the `__init__` function.
+```
+with h5py.File(path_h5, 'r') as h5_obj:
+    self.file_list = list(h5_obj.keys())  # Each key is the relative path to a file
+```
+* Step 2. Open the `H5` reader object **within** the `__getitem__` call. This creates a separate reader object for each individual worker.
+```
+if not hasattr(self, 'h5_obj'):
+    self.h5_obj = h5py.File(self.path_h5, mode='r', swmr=True)
+```
+* Step 3. Add a safe closing operation for the H5 file.
+```
+def __del__(self, ):
+    self.h5_obj.close()
+```
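+
+Putting the three steps together, a minimal loader mirroring `benchmark.py` could look like the sketch below; the class name `H5Dataset`, the example path `exp.h5` and the loader settings are placeholders for your own pipeline:
+
+```
+import h5py
+import torch
+
+
+class H5Dataset(torch.utils.data.Dataset):
+    def __init__(self, path_h5):
+        self.path_h5 = path_h5
+        # Step 1: list every path stored in the H5 file
+        with h5py.File(path_h5, 'r') as h5_obj:
+            self.file_list = list(h5_obj.keys())
+
+    def __len__(self):
+        return len(self.file_list)
+
+    def __getitem__(self, idx):
+        # Step 2: lazily open one reader object per worker
+        if not hasattr(self, 'h5_obj'):
+            self.h5_obj = h5py.File(self.path_h5, mode='r', swmr=True)
+        return self.h5_obj[self.file_list[idx]][:]
+
+    def __del__(self):
+        # Step 3: close the reader safely
+        if hasattr(self, 'h5_obj'):
+            self.h5_obj.close()
+
+
+loader = torch.utils.data.DataLoader(H5Dataset('exp.h5'),
+                                     batch_size=48,
+                                     num_workers=4)
+```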
 
 ## Benchmarks
 Coming soon!
diff --git a/args_maker.py b/args_maker.py
index fdb3e46..73e2f76 100644
--- a/args_maker.py
+++ b/args_maker.py
@@ -22,12 +22,14 @@ def make_args():
                         help='specify png or jpg to select ext. default: jpg')
     parser.add_argument('--custom_read_func', action='store_true',
                         help='use custom read function?')
+    parser.add_argument('--custom_prune_func', action='store_true',
+                        help='use custom extension matching function')
 
     required_args = parser.add_argument_group('required named arguments')
-    required_args.add_argument('--path_images', required=True,
+    required_args.add_argument('--path_images', required=False,
                                help='abs path to image directory',
                                default='D:/Datasets/Gaze360/imgs')
-    required_args.add_argument('--path_output', required=True,
+    required_args.add_argument('--path_output', required=False,
                                help='abs path to output H5 file',
                                default='D:/exp.h5')
     args = parser.parse_args()
@@ -37,8 +39,4 @@ def make_args():
     args.path_images = os.path.abspath(args.path_images)
     args.path_output = os.path.abspath(args.path_output)
 
-    # Delete and create a new H5 file
-    h5_obj = h5py.File(args.path_output, 'w')
-    h5_obj.close()
-
     return args
diff --git a/benchmark.py b/benchmark.py
index 882f810..5a4b2d9 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -16,6 +16,7 @@
 
 
 def join_path(path_left, path_right):
+    # Join two paths
     return os.path.join(path_left, path_right)
 
 
@@ -26,11 +27,9 @@ def __init__(self, path_h5, path_folder, mode='H5'):
         self.path_folder = path_folder
         self.read_from_H5 = True if mode == 'H5' else False
 
+        # Step #1: Generate a list of all images
         with h5py.File(path_h5, 'r') as h5_obj:
             self.file_list = list(h5_obj.keys())
-
-    def __del__(self, ):
-        self.h5_obj.close()
 
     def __len__(self, ):
         return len(self.file_list)
@@ -40,16 +39,27 @@ def __getitem__(self, idx):
         entry_str = self.file_list[idx]
 
         if self.read_from_H5:
+            # Step #2: Create a H5 object within the __getitem__ call.
+            # This creates a H5 reader object for each worker.
            if not hasattr(self, 'h5_obj'):
                 self.h5_obj = h5py.File(self.path_h5, mode='r',
                                         swmr=True)
+
+            # Reading a datum from the H5 file
             datum = self.h5_obj[entry_str][:]
         else:
+            # Read a datum from the physical location
             path_file = join_path(self.path_folder, entry_str)
             datum = cv2.imread(os.path.abspath(path_file))
+
+        # An operation to facilitate benchmarking
         datum = cv2.resize(datum, (224, 244),
                            interpolation=cv2.INTER_LANCZOS4)
         return datum
+
+    # Step #3: Add a safe closing operation for when the loader is destroyed
+    def __del__(self, ):
+        self.h5_obj.close()
 
 
 if __name__ == '__main__':
@@ -58,8 +68,9 @@ def __getitem__(self, idx):
     bench_obj = benchmark(args['path_output'], args['path_images'])
 
     loader = torch.utils.data.DataLoader(bench_obj,
+                                         shuffle=True,
                                          batch_size=48,
-                                         num_workers=4)
+                                         num_workers=0)
 
     for epoch in range(3):
         time_elapsed = []
diff --git a/converter.py b/converter.py
index e44fa99..502b1a0 100644
--- a/converter.py
+++ b/converter.py
@@ -19,13 +19,20 @@ def join_path(path_left, path_right):
 
 
 class capture_within_H5():
+    '''
+    A class to find all files with the given extension and store them
+    into a H5 file.
+    '''
     def __init__(self, args):
-        self.args_dict = vars(args)
+        # Initialize the converter and generate a list of valid files
+        self.args_dict = args
         root_dir_file_list = self.create_tree()
         list_of_valid_datum = self.prune_tree(root_dir_file_list)
         self.list_of_valid_datum = self.generate_full_path(list_of_valid_datum)
 
     def generate_full_path(self, paths):
+        # Given a list of os.walk tuples, append the root path
+        # to all individual relative paths
         full_paths = []
         for path in paths:
             vals = [join_path(path[0], ele) for ele in path[1]]
@@ -33,11 +40,14 @@ def generate_full_path(self, paths):
         return list(itertools.chain(*full_paths))
 
     def create_tree(self):
+        # Generate a tree of all files in a given directory
         return list(os.walk(self.args_dict['path_images']))
 
     def prune_tree(self, tree):
+        # Prune a tree by only accepting files which match the
+        # provided file extension
         list_of_valid_datum = []
-        for (dir_, dirs, files) in tree:
+        for (dir_, _, files) in tree:
             valid_files = self.prune_files(files)
             if any(valid_files):
                 rel_root = os.path.relpath(dir_, self.args_dict['path_images'])
@@ -46,7 +56,12 @@ def prune_tree(self, tree):
 
     def prune_files(self, files):
         if any(files):
-            return [fi for fi in files if fi.endswith(self.args_dict['ext'])]
+            if self.args_dict['custom_prune_func']:
+                return [fi for fi in files
+                        if my_prune(fi, self.args_dict['ext'])]
+            else:
+                return [fi for fi in files
+                        if self.default_prune(fi, self.args_dict['ext'])]
         else:
             return []
 
@@ -93,7 +108,6 @@ def read_write(self, ):
 
     def read_function(self, path_sample):
         if self.args_dict['custom_read_func']:
-            from my_functions import my_read
             data = my_read(path_sample)
         else:
             data = self.default_reader(path_sample)
@@ -106,18 +120,32 @@ def read_function(self, path_sample):
             return True
         else:
-
             return data
 
     def default_reader(self, path_sample):
+        # Read operation for jpg or png images
         path_image = join_path(self.args_dict['path_images'], path_sample)
         image = imageio.imread(path_image)
         return path_sample, image
+
+    def default_prune(self, filename_str, ext_str):
+        # Given an input file name, return True if it ends with the extension, else False
+        return filename_str.endswith(ext_str)
 
 
 if __name__ == '__main__':
-    args = make_args()
-    obj = capture_within_H5(args)
+    args = vars(make_args())
+
+    # Import the user-supplied hooks before the converter scans the tree
+    if args['custom_read_func']:
+        from my_functions import my_read
+
+    if args['custom_prune_func']:
+        from my_functions import my_prune
+
+    # %% Delete and create a new H5 file
+    h5_obj = h5py.File(args['path_output'], 'w')
+    h5_obj.close()
+
+    obj = capture_within_H5(args)
 
     # %% Begin reading and writing to H5
     obj.read_write()
diff --git a/my_functions.py b/my_functions.py
index cb6611c..5eb0f29 100644
--- a/my_functions.py
+++ b/my_functions.py
@@ -12,5 +12,11 @@ def my_read(path_sample):
     return path_sample, datum
 
 
+def my_prune(filename_str, ext_str):
+    # Add logic here to verify that the extension template ext_str
+    # matches filename_str; this placeholder accepts every file
+    return True
+
+
 if __name__ == '__main__':
     print('Entry point script is converter.py. Follow instructions in README.')