From e9ffd372cf01c24d3f70eff8b53105cbf4a8f80b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 6 Oct 2022 13:11:12 -0400 Subject: [PATCH 1/5] Initial working filesystem benchmark --- benchmarks/fsbench/README.md | 4 ++ benchmarks/fsbench/benchfile.py | 37 ++++++++++++++++ benchmarks/fsbench/dev.yaml | 36 ++++++++++++++++ benchmarks/fsbench/main.py | 66 +++++++++++++++++++++++++++++ benchmarks/fsbench/manifest | 2 + benchmarks/fsbench/prepare.py | 15 +++++++ benchmarks/fsbench/requirements.txt | 4 ++ benchmarks/fsbench/voirfile.py | 27 ++++++++++++ 8 files changed, 191 insertions(+) create mode 100644 benchmarks/fsbench/README.md create mode 100644 benchmarks/fsbench/benchfile.py create mode 100644 benchmarks/fsbench/dev.yaml create mode 100644 benchmarks/fsbench/main.py create mode 100644 benchmarks/fsbench/manifest create mode 100755 benchmarks/fsbench/prepare.py create mode 100644 benchmarks/fsbench/requirements.txt create mode 100644 benchmarks/fsbench/voirfile.py diff --git a/benchmarks/fsbench/README.md b/benchmarks/fsbench/README.md new file mode 100644 index 000000000..ebce72bf5 --- /dev/null +++ b/benchmarks/fsbench/README.md @@ -0,0 +1,4 @@ + +# Benchmark + +Rewrite this README to explain what the benchmark is! diff --git a/benchmarks/fsbench/benchfile.py b/benchmarks/fsbench/benchfile.py new file mode 100644 index 000000000..b8e047fbe --- /dev/null +++ b/benchmarks/fsbench/benchfile.py @@ -0,0 +1,37 @@ +from milabench.pack import Package + + +class TheBenchmark(Package): + # Requirements file installed by install(). It can be empty or absent. + requirements_file = "requirements.txt" + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + def install(self): + super().install() # super() call installs the requirements + + def prepare(self): + super().prepare() # super() call executes prepare_script + + def run(self, args, voirargs, env): + # You can insert new arguments to args/voirargs or change the env, + # although changing the env is a bit simpler if you modify make_env + return super().run(args, voirargs, env) + # Note: run() must return a running process, so make sure not to lose + # the return value of super() here. + + +__pack__ = TheBenchmark diff --git a/benchmarks/fsbench/dev.yaml b/benchmarks/fsbench/dev.yaml new file mode 100644 index 000000000..5e2b0763c --- /dev/null +++ b/benchmarks/fsbench/dev.yaml @@ -0,0 +1,36 @@ +defaults: &defaults + dirs: + code: ./_dev_{name} + venv: venv/{name} + data: data + runs: runs + + venv: + type: virtualenv + + plan: + method: njobs + n: 1 + + voir: + --stop: 10 + --forward: + - "#stdout" + - "#stderr" + - "#end" + - loss + - compute_rate + - train_rate + - loading_rate + - descr + - progress + - total + --compute-rate: true + --train-rate: true + --loading-rate: true + +benchmarks: + fsbench: + <<<: *defaults + + definition: ./ diff --git a/benchmarks/fsbench/main.py b/benchmarks/fsbench/main.py new file mode 100644 index 000000000..2a874e1b1 --- /dev/null +++ b/benchmarks/fsbench/main.py @@ -0,0 +1,66 @@ +from torchvision.datasets import DatasetFolder, ImageFolder +from torchvision.transforms import ToTensor +from torch.utils.data import DataLoader, Dataset +import os +from giving import give +#from pysquash import SquashCursor + +shuffle = False +batch_size = 64 +n_epoch_valid = 10 +n_iters = 5 +# number of process to use for loading (0 = don't use extra processes) +multiprocess_load = 0 + + +class SqhDataset(Dataset): + def __init__(self, root): + self.root = root + labels = list(root) + self.files = [] + for label in labels: + self.files.extend(map(lambda p: label + b'/' + p, root.cd(label))) + + def __len__(self): + return len(self.files) + + def __getitem__(self, idx): + path = self.files[idx] + label = int(path.split(b'/')[0]) + return len(self.root.open(path, binary=True, buffering=0).readall()), label + + +def _ld(path): + with open(path, 'rb') as f: + return len(f.read()) + + +def make_loader(path, sub, shuffle): + load = DatasetFolder(os.path.join(path, sub), loader=_ld, extensions=('.jpeg',)) + #load = SqhDataset(SquashCursor(path + ".sqh").cd(sub.encode('utf-8'))) + #load = ImageFolder(path, transform=ToTensor()) + return DataLoader(load, batch_size=batch_size, shuffle=shuffle, num_workers=multiprocess_load) + +def main(): + # Write code here + data_directory = os.environ.get("MILABENCH_DIR_DATA", None) + dataset_dir = os.path.join(data_directory, "LargeFakeUniform") + + train_loader = make_loader(dataset_dir, "train", shuffle=shuffle) + valid_loader = make_loader(dataset_dir, "val", shuffle=False) + test_loader = make_loader(dataset_dir, "test", shuffle=False) + + for iter in range(n_iters): + for epoch in range(n_epoch_valid): + for inp, target in train_loader: + give(batch=inp, step=True) + for inp, target in valid_loader: + give(batch=inp, step=True) + for inp, target in test_loader: + give(batch=inp, step=True) + + +if __name__ == "__main__": + # Note: The line `if __name__ == "__main__"` is necessary for milabench + # to recognize the entry point (it does some funky stuff to it). + main() diff --git a/benchmarks/fsbench/manifest b/benchmarks/fsbench/manifest new file mode 100644 index 000000000..2b686980e --- /dev/null +++ b/benchmarks/fsbench/manifest @@ -0,0 +1,2 @@ +* +!_dev_* diff --git a/benchmarks/fsbench/prepare.py b/benchmarks/fsbench/prepare.py new file mode 100755 index 000000000..6e31f6fcc --- /dev/null +++ b/benchmarks/fsbench/prepare.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +import os + +from milabench.datasets.fake_images import generate_sets + + +scale = 1000 +if __name__ == "__main__": + # If you need the whole configuration: + # config = json.loads(os.environ["MILABENCH_CONFIG"]) + + data_directory = os.environ["MILABENCH_DIR_DATA"] + dest = os.path.join(data_directory, "LargeFakeUniform") + + generate_sets(dest, {"train": 14000 * scale, "val": 500 * scale, "test": 500 * scale}, (3, 512, 512)) diff --git a/benchmarks/fsbench/requirements.txt b/benchmarks/fsbench/requirements.txt new file mode 100644 index 000000000..fc97884c4 --- /dev/null +++ b/benchmarks/fsbench/requirements.txt @@ -0,0 +1,4 @@ +torch +torchvision +tqdm + diff --git a/benchmarks/fsbench/voirfile.py b/benchmarks/fsbench/voirfile.py new file mode 100644 index 000000000..179f60023 --- /dev/null +++ b/benchmarks/fsbench/voirfile.py @@ -0,0 +1,27 @@ +# Import this to instrument the ArgumentParser, remove if no such thing +from milabench.opt import instrument_argparse + + +def instrument_probes(ov): + # Probe for the necessary data. More information here: + # https://breuleux.github.io/milabench/instrument.html#probing-for-milabench + + yield ov.phases.load_script + + # loss + ... + + # batch + step + ... + + # use_cuda + ... + + # model + ... + + # loader + ... + + # batch + compute_start + compute_end + ... From b125a9ffa039fd2df24a9755a31de3be1e06a7f5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 18 Oct 2022 12:24:17 -0400 Subject: [PATCH 2/5] add arguments to the fs tests and use them for multiple test cases --- benchmarks/fsbench/dev.yaml | 15 ++++-- benchmarks/fsbench/main.py | 103 +++++++++++++++++++++++++++--------- 2 files changed, 89 insertions(+), 29 deletions(-) diff --git a/benchmarks/fsbench/dev.yaml b/benchmarks/fsbench/dev.yaml index 5e2b0763c..0a83109fe 100644 --- a/benchmarks/fsbench/dev.yaml +++ b/benchmarks/fsbench/dev.yaml @@ -1,7 +1,7 @@ defaults: &defaults dirs: - code: ./_dev_{name} - venv: venv/{name} + code: ./_dev_fsbench + venv: venv/fsbench data: data runs: runs @@ -29,8 +29,15 @@ defaults: &defaults --train-rate: true --loading-rate: true + definition: ./ + benchmarks: - fsbench: + raw-fs: <<<: *defaults + argv: + --load-type: fs - definition: ./ + decode-fs: + <<<: *defaults + argv: + --load-type: decode diff --git a/benchmarks/fsbench/main.py b/benchmarks/fsbench/main.py index 2a874e1b1..c24795f25 100644 --- a/benchmarks/fsbench/main.py +++ b/benchmarks/fsbench/main.py @@ -1,16 +1,10 @@ -from torchvision.datasets import DatasetFolder, ImageFolder -from torchvision.transforms import ToTensor -from torch.utils.data import DataLoader, Dataset +import argparse import os -from giving import give -#from pysquash import SquashCursor -shuffle = False -batch_size = 64 -n_epoch_valid = 10 -n_iters = 5 -# number of process to use for loading (0 = don't use extra processes) -multiprocess_load = 0 +from giving import give +from torch.utils.data import DataLoader, Dataset +from torchvision.datasets import DatasetFolder, ImageFolder +from torchvision.transforms import ToTensor class SqhDataset(Dataset): @@ -19,39 +13,98 @@ def __init__(self, root): labels = list(root) self.files = [] for label in labels: - self.files.extend(map(lambda p: label + b'/' + p, root.cd(label))) + self.files.extend(map(lambda p: label + b"/" + p, root.cd(label))) def __len__(self): return len(self.files) def __getitem__(self, idx): path = self.files[idx] - label = int(path.split(b'/')[0]) + label = int(path.split(b"/")[0]) return len(self.root.open(path, binary=True, buffering=0).readall()), label def _ld(path): - with open(path, 'rb') as f: + with open(path, "rb") as f: return len(f.read()) -def make_loader(path, sub, shuffle): - load = DatasetFolder(os.path.join(path, sub), loader=_ld, extensions=('.jpeg',)) - #load = SqhDataset(SquashCursor(path + ".sqh").cd(sub.encode('utf-8'))) - #load = ImageFolder(path, transform=ToTensor()) - return DataLoader(load, batch_size=batch_size, shuffle=shuffle, num_workers=multiprocess_load) +def make_loader(path, sub, shuffle, batch_size, loading_processes, load_type): + if load_type == "fs": + load = DatasetFolder(os.path.join(path, sub), loader=_ld, extensions=(".jpeg",)) + elif load_type == "squash": + from pysquash import SquashCursor + + load = SqhDataset(SquashCursor(path + ".sqh").cd(sub.encode("utf-8"))) + elif load_type == "decode": + load = ImageFolder(path, transform=ToTensor()) + else: + raise ValueError("unknown load_type") + return DataLoader( + load, batch_size=batch_size, shuffle=shuffle, num_workers=loading_processes + ) + def main(): - # Write code here + parser = argparse.ArgumentParser(description="Filesystem benchmarks") + parser.add_argument( + "--shuffle", type=bool, default=False, help="read dataset in a random order" + ) + parser.add_argument("--batch-size", type=int, default=64) + parser.add_argument( + "--epochs-valid", + type=int, + default=10, + help="number of epochs between validations", + ) + parser.add_argument( + "--iters", type=int, default=5, help="number of train/valid cycles to run" + ) + parser.add_argument( + "--loading-processes", + type=int, + default=0, + help="number of external processes to use for loading (0 to disable)", + ) + parser.add_argument( + "--load-type", + required=True, + choices=("fs", "squash", "decode"), + help="type of loading to test, 'fs' is raw filesystem with no decode, 'squash' is loading trough squashfile, and 'decode' if from the filesystem, but including image decode", + ) + + args = parser.parse_args() + data_directory = os.environ.get("MILABENCH_DIR_DATA", None) dataset_dir = os.path.join(data_directory, "LargeFakeUniform") - train_loader = make_loader(dataset_dir, "train", shuffle=shuffle) - valid_loader = make_loader(dataset_dir, "val", shuffle=False) - test_loader = make_loader(dataset_dir, "test", shuffle=False) + train_loader = make_loader( + dataset_dir, + "train", + shuffle=args.shuffle, + batch_size=args.batch_size, + loading_processes=args.loading_processes, + load_type=args.load_type, + ) + valid_loader = make_loader( + dataset_dir, + "val", + shuffle=False, + batch_size=args.batch_size, + loading_processes=args.loading_processes, + load_type=args.load_type, + ) + test_loader = make_loader( + dataset_dir, + "test", + shuffle=False, + batch_size=args.batch_size, + loading_processes=args.loading_processes, + load_type=args.load_type, + ) - for iter in range(n_iters): - for epoch in range(n_epoch_valid): + for iter in range(args.iters): + for epoch in range(args.epochs_valid): for inp, target in train_loader: give(batch=inp, step=True) for inp, target in valid_loader: From 5f48624828f62bf486cfa76c7422c417aa540d2f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 18 Oct 2022 12:29:32 -0400 Subject: [PATCH 3/5] Set the dataset size to about 200gb --- benchmarks/fsbench/prepare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/fsbench/prepare.py b/benchmarks/fsbench/prepare.py index 6e31f6fcc..f877990b0 100755 --- a/benchmarks/fsbench/prepare.py +++ b/benchmarks/fsbench/prepare.py @@ -4,7 +4,8 @@ from milabench.datasets.fake_images import generate_sets -scale = 1000 +# adjust the size of the generated dataset (1 = ~2Gb) +scale = 100 if __name__ == "__main__": # If you need the whole configuration: # config = json.loads(os.environ["MILABENCH_CONFIG"]) From 6974ac2b8539da53d47fadfe027d46a80679a076 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 20 Oct 2022 11:58:36 -0400 Subject: [PATCH 4/5] Make sure to compute the loading_rate and don't stop after 10 steps. --- benchmarks/fsbench/dev.yaml | 7 ------- benchmarks/fsbench/main.py | 12 +++++++----- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/benchmarks/fsbench/dev.yaml b/benchmarks/fsbench/dev.yaml index 0a83109fe..18a1702a5 100644 --- a/benchmarks/fsbench/dev.yaml +++ b/benchmarks/fsbench/dev.yaml @@ -13,20 +13,13 @@ defaults: &defaults n: 1 voir: - --stop: 10 --forward: - "#stdout" - "#stderr" - "#end" - - loss - - compute_rate - - train_rate - loading_rate - - descr - progress - total - --compute-rate: true - --train-rate: true --loading-rate: true definition: ./ diff --git a/benchmarks/fsbench/main.py b/benchmarks/fsbench/main.py index c24795f25..efc36b0eb 100644 --- a/benchmarks/fsbench/main.py +++ b/benchmarks/fsbench/main.py @@ -58,7 +58,7 @@ def main(): help="number of epochs between validations", ) parser.add_argument( - "--iters", type=int, default=5, help="number of train/valid cycles to run" + "--iters", type=int, default=1, help="number of train/valid cycles to run" ) parser.add_argument( "--loading-processes", @@ -103,15 +103,17 @@ def main(): load_type=args.load_type, ) + give(loader=train_loader) + give(loader=valid_loader) + give(loader=test_loader) for iter in range(args.iters): for epoch in range(args.epochs_valid): for inp, target in train_loader: - give(batch=inp, step=True) + pass for inp, target in valid_loader: - give(batch=inp, step=True) + pass for inp, target in test_loader: - give(batch=inp, step=True) - + pass if __name__ == "__main__": # Note: The line `if __name__ == "__main__"` is necessary for milabench From 356893976f1837a47e98f7667bb5f258797ba051 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 25 Oct 2022 16:34:50 -0400 Subject: [PATCH 5/5] rework measuring and add support for deeplake --- benchmarks/fsbench/dev.yaml | 14 ++++++++++-- benchmarks/fsbench/main.py | 16 +++++++------- benchmarks/fsbench/prepare.py | 34 +++++++++++++++++++++++++++++ benchmarks/fsbench/requirements.txt | 2 +- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/benchmarks/fsbench/dev.yaml b/benchmarks/fsbench/dev.yaml index 18a1702a5..19c94ce5d 100644 --- a/benchmarks/fsbench/dev.yaml +++ b/benchmarks/fsbench/dev.yaml @@ -17,10 +17,10 @@ defaults: &defaults - "#stdout" - "#stderr" - "#end" - - loading_rate + - train_rate - progress - total - --loading-rate: true + --train-rate: true definition: ./ @@ -30,6 +30,16 @@ benchmarks: argv: --load-type: fs + squash-fs: + <<<: *defaults + argv: + --load-type: squash + + deeplake-fs: + <<<: *defaults + argv: + --load-type: deeplake + decode-fs: <<<: *defaults argv: diff --git a/benchmarks/fsbench/main.py b/benchmarks/fsbench/main.py index efc36b0eb..694edf687 100644 --- a/benchmarks/fsbench/main.py +++ b/benchmarks/fsbench/main.py @@ -36,6 +36,9 @@ def make_loader(path, sub, shuffle, batch_size, loading_processes, load_type): from pysquash import SquashCursor load = SqhDataset(SquashCursor(path + ".sqh").cd(sub.encode("utf-8"))) + elif load_type == "deeplake": + import deeplake + return deeplake.load(path + ".lake")[sub].pytorch(batch_size=batch_size, shuffle=shuffle, num_workers=loading_processes, tensors=["images", "labels"], return_index=False, tobytes=True) elif load_type == "decode": load = ImageFolder(path, transform=ToTensor()) else: @@ -69,7 +72,7 @@ def main(): parser.add_argument( "--load-type", required=True, - choices=("fs", "squash", "decode"), + choices=("fs", "squash", "deeplake", "decode"), help="type of loading to test, 'fs' is raw filesystem with no decode, 'squash' is loading trough squashfile, and 'decode' if from the filesystem, but including image decode", ) @@ -103,17 +106,14 @@ def main(): load_type=args.load_type, ) - give(loader=train_loader) - give(loader=valid_loader) - give(loader=test_loader) - for iter in range(args.iters): + for _ in range(args.iters): for epoch in range(args.epochs_valid): for inp, target in train_loader: - pass + give(batch=inp, step=True) for inp, target in valid_loader: - pass + give(batch=inp, step=True) for inp, target in test_loader: - pass + give(batch=inp, step=True) if __name__ == "__main__": # Note: The line `if __name__ == "__main__"` is necessary for milabench diff --git a/benchmarks/fsbench/prepare.py b/benchmarks/fsbench/prepare.py index f877990b0..103de650d 100755 --- a/benchmarks/fsbench/prepare.py +++ b/benchmarks/fsbench/prepare.py @@ -2,6 +2,28 @@ import os from milabench.datasets.fake_images import generate_sets +from milabench.fs import XPath +import deeplake +import numpy as np + + +def make_deeplake_group(ds, folder, class_names): + group_name = os.path.basename(folder) + + files_list = [] + for dirpath, dirnames, filenames in os.walk(os.path.join(folder)): + for filename in filenames: + if filename == 'done': + continue + files_list.append(os.path.join(dirpath, filename)) + + with ds: + ds.create_group(group_name) + ds[group_name].create_tensor('images', htype='image', sample_compression='jpeg') + ds[group_name].create_tensor('labels', htype='class_label', class_names=class_names) + for f in files_list: + label_num = int(os.path.basename(os.path.dirname(f))) + ds[group_name].append({'images': deeplake.read(f), 'labels': np.uint16(label_num)}) # adjust the size of the generated dataset (1 = ~2Gb) @@ -14,3 +36,15 @@ dest = os.path.join(data_directory, "LargeFakeUniform") generate_sets(dest, {"train": 14000 * scale, "val": 500 * scale, "test": 500 * scale}, (3, 512, 512)) + + root = dest + '.lake' + sentinel = XPath(root + '-done') + if sentinel.exists(): + print(f"{root} was already generated") + else: + ds = deeplake.empty(dest + '.lake') + class_names = [str(i) for i in range(1000)] + make_deeplake_group(ds, os.path.join(dest, 'train'), class_names) + make_deeplake_group(ds, os.path.join(dest, 'val'), class_names) + make_deeplake_group(ds, os.path.join(dest, 'test'), class_names) + sentinel.touch() diff --git a/benchmarks/fsbench/requirements.txt b/benchmarks/fsbench/requirements.txt index fc97884c4..6fe063905 100644 --- a/benchmarks/fsbench/requirements.txt +++ b/benchmarks/fsbench/requirements.txt @@ -1,4 +1,4 @@ torch torchvision tqdm - +deeplake