Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fsbench #129

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/fsbench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

# Benchmark

fsbench measures data-loading throughput for an image dataset under four strategies: raw filesystem reads (`fs`), reads through a squashfile (`squash`), a deeplake store (`deeplake`), and filesystem reads including image decode (`decode`). Run `prepare.py` to generate the synthetic dataset, then `main.py --load-type <strategy>` to benchmark it.
37 changes: 37 additions & 0 deletions benchmarks/fsbench/benchfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from milabench.pack import Package


class TheBenchmark(Package):
# Requirements file installed by install(). It can be empty or absent.
requirements_file = "requirements.txt"

# The preparation script called by prepare(). It must be executable,
# but it can be any type of script. It can be empty or absent.
prepare_script = "prepare.py"

# The main script called by run(). It must be a Python file. It has to
# be present.
main_script = "main.py"

# You can remove the functions below if you don't need to modify them.

def make_env(self):
# Return a dict of environment variables for prepare_script and
# main_script.
return super().make_env()

def install(self):
super().install() # super() call installs the requirements

def prepare(self):
super().prepare() # super() call executes prepare_script

def run(self, args, voirargs, env):
# You can insert new arguments to args/voirargs or change the env,
# although changing the env is a bit simpler if you modify make_env
return super().run(args, voirargs, env)
# Note: run() must return a running process, so make sure not to lose
# the return value of super() here.


__pack__ = TheBenchmark
46 changes: 46 additions & 0 deletions benchmarks/fsbench/dev.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
defaults: &defaults
dirs:
code: ./_dev_fsbench
venv: venv/fsbench
data: data
runs: runs

venv:
type: virtualenv

plan:
method: njobs
n: 1

voir:
--forward:
- "#stdout"
- "#stderr"
- "#end"
- train_rate
- progress
- total
--train-rate: true

definition: ./

benchmarks:
raw-fs:
<<<: *defaults
argv:
--load-type: fs

squash-fs:
<<<: *defaults
argv:
--load-type: squash

deeplake-fs:
<<<: *defaults
argv:
--load-type: deeplake

decode-fs:
<<<: *defaults
argv:
--load-type: decode
121 changes: 121 additions & 0 deletions benchmarks/fsbench/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import argparse
import os

from giving import give
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import DatasetFolder, ImageFolder
from torchvision.transforms import ToTensor


class SqhDataset(Dataset):
def __init__(self, root):
self.root = root
labels = list(root)
self.files = []
for label in labels:
self.files.extend(map(lambda p: label + b"/" + p, root.cd(label)))

def __len__(self):
return len(self.files)

def __getitem__(self, idx):
path = self.files[idx]
label = int(path.split(b"/")[0])
return len(self.root.open(path, binary=True, buffering=0).readall()), label


def _ld(path):
with open(path, "rb") as f:
return len(f.read())


def make_loader(path, sub, shuffle, batch_size, loading_processes, load_type):
if load_type == "fs":
load = DatasetFolder(os.path.join(path, sub), loader=_ld, extensions=(".jpeg",))
elif load_type == "squash":
from pysquash import SquashCursor

load = SqhDataset(SquashCursor(path + ".sqh").cd(sub.encode("utf-8")))
elif load_type == "deeplake":
import deeplake
return deeplake.load(path + ".lake")[sub].pytorch(batch_size=batch_size, shuffle=shuffle, num_workers=loading_processes, tensors=["images", "labels"], return_index=False, tobytes=True)
elif load_type == "decode":
load = ImageFolder(path, transform=ToTensor())
else:
raise ValueError("unknown load_type")
return DataLoader(
load, batch_size=batch_size, shuffle=shuffle, num_workers=loading_processes
)


def main():
parser = argparse.ArgumentParser(description="Filesystem benchmarks")
parser.add_argument(
"--shuffle", type=bool, default=False, help="read dataset in a random order"
)
parser.add_argument("--batch-size", type=int, default=64)
parser.add_argument(
"--epochs-valid",
type=int,
default=10,
help="number of epochs between validations",
)
parser.add_argument(
"--iters", type=int, default=1, help="number of train/valid cycles to run"
)
parser.add_argument(
"--loading-processes",
type=int,
default=0,
help="number of external processes to use for loading (0 to disable)",
)
parser.add_argument(
"--load-type",
required=True,
choices=("fs", "squash", "deeplake", "decode"),
help="type of loading to test, 'fs' is raw filesystem with no decode, 'squash' is loading trough squashfile, and 'decode' if from the filesystem, but including image decode",
)

args = parser.parse_args()

data_directory = os.environ.get("MILABENCH_DIR_DATA", None)
dataset_dir = os.path.join(data_directory, "LargeFakeUniform")

train_loader = make_loader(
dataset_dir,
"train",
shuffle=args.shuffle,
batch_size=args.batch_size,
loading_processes=args.loading_processes,
load_type=args.load_type,
)
valid_loader = make_loader(
dataset_dir,
"val",
shuffle=False,
batch_size=args.batch_size,
loading_processes=args.loading_processes,
load_type=args.load_type,
)
test_loader = make_loader(
dataset_dir,
"test",
shuffle=False,
batch_size=args.batch_size,
loading_processes=args.loading_processes,
load_type=args.load_type,
)

for _ in range(args.iters):
for epoch in range(args.epochs_valid):
for inp, target in train_loader:
give(batch=inp, step=True)
for inp, target in valid_loader:
give(batch=inp, step=True)
for inp, target in test_loader:
give(batch=inp, step=True)

if __name__ == "__main__":
# Note: The line `if __name__ == "__main__"` is necessary for milabench
# to recognize the entry point (it does some funky stuff to it).
main()
2 changes: 2 additions & 0 deletions benchmarks/fsbench/manifest
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!_dev_*
50 changes: 50 additions & 0 deletions benchmarks/fsbench/prepare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python
import os

from milabench.datasets.fake_images import generate_sets
from milabench.fs import XPath
import deeplake
import numpy as np


def make_deeplake_group(ds, folder, class_names):
group_name = os.path.basename(folder)

files_list = []
for dirpath, dirnames, filenames in os.walk(os.path.join(folder)):
for filename in filenames:
if filename == 'done':
continue
files_list.append(os.path.join(dirpath, filename))

with ds:
ds.create_group(group_name)
ds[group_name].create_tensor('images', htype='image', sample_compression='jpeg')
ds[group_name].create_tensor('labels', htype='class_label', class_names=class_names)
for f in files_list:
label_num = int(os.path.basename(os.path.dirname(f)))
ds[group_name].append({'images': deeplake.read(f), 'labels': np.uint16(label_num)})


# adjust the size of the generated dataset (1 = ~2Gb)
scale = 100
if __name__ == "__main__":
# If you need the whole configuration:
# config = json.loads(os.environ["MILABENCH_CONFIG"])

data_directory = os.environ["MILABENCH_DIR_DATA"]
dest = os.path.join(data_directory, "LargeFakeUniform")

generate_sets(dest, {"train": 14000 * scale, "val": 500 * scale, "test": 500 * scale}, (3, 512, 512))

root = dest + '.lake'
sentinel = XPath(root + '-done')
if sentinel.exists():
print(f"{root} was already generated")
else:
ds = deeplake.empty(dest + '.lake')
class_names = [str(i) for i in range(1000)]
make_deeplake_group(ds, os.path.join(dest, 'train'), class_names)
make_deeplake_group(ds, os.path.join(dest, 'val'), class_names)
make_deeplake_group(ds, os.path.join(dest, 'test'), class_names)
sentinel.touch()
4 changes: 4 additions & 0 deletions benchmarks/fsbench/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
torch
torchvision
tqdm
deeplake
27 changes: 27 additions & 0 deletions benchmarks/fsbench/voirfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Import this to instrument the ArgumentParser, remove if no such thing
from milabench.opt import instrument_argparse


def instrument_probes(ov):
# Probe for the necessary data. More information here:
# https://breuleux.github.io/milabench/instrument.html#probing-for-milabench

yield ov.phases.load_script

# loss
...

# batch + step
...

# use_cuda
...

# model
...

# loader
...

# batch + compute_start + compute_end
...