Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fsbench #129

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/fsbench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

# Benchmark

fsbench measures data-loading throughput for an image dataset under four strategies: raw filesystem reads (`fs`), reads through a squashfile (`squash`), a deeplake store (`deeplake`), and filesystem reads including image decode (`decode`). Run `prepare.py` to generate the synthetic dataset, then `main.py --load-type <strategy>` to benchmark it.
37 changes: 37 additions & 0 deletions benchmarks/fsbench/benchfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from milabench.pack import Package


class TheBenchmark(Package):
# Requirements file installed by install(). It can be empty or absent.
requirements_file = "requirements.txt"

# The preparation script called by prepare(). It must be executable,
# but it can be any type of script. It can be empty or absent.
prepare_script = "prepare.py"

# The main script called by run(). It must be a Python file. It has to
# be present.
main_script = "main.py"

# You can remove the functions below if you don't need to modify them.

def make_env(self):
# Return a dict of environment variables for prepare_script and
# main_script.
return super().make_env()

def install(self):
super().install() # super() call installs the requirements

def prepare(self):
super().prepare() # super() call executes prepare_script

def run(self, args, voirargs, env):
# You can insert new arguments to args/voirargs or change the env,
# although changing the env is a bit simpler if you modify make_env
return super().run(args, voirargs, env)
# Note: run() must return a running process, so make sure not to lose
# the return value of super() here.


__pack__ = TheBenchmark
46 changes: 46 additions & 0 deletions benchmarks/fsbench/dev.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
defaults: &defaults
dirs:
code: ./_dev_fsbench
venv: venv/fsbench
data: data
runs: runs

venv:
type: virtualenv

plan:
method: njobs
n: 1

voir:
--forward:
- "#stdout"
- "#stderr"
- "#end"
- train_rate
- progress
- total
--train-rate: true

definition: ./

benchmarks:
raw-fs:
<<<: *defaults
argv:
--load-type: fs

squash-fs:
<<<: *defaults
argv:
--load-type: squash

deeplake-fs:
<<<: *defaults
argv:
--load-type: deeplake

decode-fs:
<<<: *defaults
argv:
--load-type: decode
121 changes: 121 additions & 0 deletions benchmarks/fsbench/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import argparse
import os

from giving import give
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import DatasetFolder, ImageFolder
from torchvision.transforms import ToTensor


class SqhDataset(Dataset):
def __init__(self, root):
self.root = root
labels = list(root)
self.files = []
for label in labels:
self.files.extend(map(lambda p: label + b"/" + p, root.cd(label)))

def __len__(self):
return len(self.files)

def __getitem__(self, idx):
path = self.files[idx]
label = int(path.split(b"/")[0])
return len(self.root.open(path, binary=True, buffering=0).readall()), label


def _ld(path):
with open(path, "rb") as f:
return len(f.read())


def make_loader(path, sub, shuffle, batch_size, loading_processes, load_type):
if load_type == "fs":
load = DatasetFolder(os.path.join(path, sub), loader=_ld, extensions=(".jpeg",))
elif load_type == "squash":
from pysquash import SquashCursor

load = SqhDataset(SquashCursor(path + ".sqh").cd(sub.encode("utf-8")))
elif load_type == "deeplake":
import deeplake
return deeplake.load(path + ".lake")[sub].pytorch(batch_size=batch_size, shuffle=shuffle, num_workers=loading_processes, tensors=["images", "labels"], return_index=False, tobytes=True)
elif load_type == "decode":
load = ImageFolder(path, transform=ToTensor())
else:
raise ValueError("unknown load_type")
return DataLoader(
load, batch_size=batch_size, shuffle=shuffle, num_workers=loading_processes
)


def main():
parser = argparse.ArgumentParser(description="Filesystem benchmarks")
parser.add_argument(
"--shuffle", type=bool, default=False, help="read dataset in a random order"
)
parser.add_argument("--batch-size", type=int, default=64)
parser.add_argument(
"--epochs-valid",
type=int,
default=10,
help="number of epochs between validations",
)
parser.add_argument(
"--iters", type=int, default=1, help="number of train/valid cycles to run"
)
parser.add_argument(
"--loading-processes",
type=int,
default=0,
help="number of external processes to use for loading (0 to disable)",
)
parser.add_argument(
"--load-type",
required=True,
choices=("fs", "squash", "deeplake", "decode"),
help="type of loading to test, 'fs' is raw filesystem with no decode, 'squash' is loading trough squashfile, and 'decode' if from the filesystem, but including image decode",
)

args = parser.parse_args()

data_directory = os.environ.get("MILABENCH_DIR_DATA", None)
dataset_dir = os.path.join(data_directory, "LargeFakeUniform")

train_loader = make_loader(
dataset_dir,
"train",
shuffle=args.shuffle,
batch_size=args.batch_size,
loading_processes=args.loading_processes,
load_type=args.load_type,
)
valid_loader = make_loader(
dataset_dir,
"val",
shuffle=False,
batch_size=args.batch_size,
loading_processes=args.loading_processes,
load_type=args.load_type,
)
test_loader = make_loader(
dataset_dir,
"test",
shuffle=False,
batch_size=args.batch_size,
loading_processes=args.loading_processes,
load_type=args.load_type,
)

for _ in range(args.iters):
for epoch in range(args.epochs_valid):
for inp, target in train_loader:
give(batch=inp, step=True)
for inp, target in valid_loader:
give(batch=inp, step=True)
for inp, target in test_loader:
give(batch=inp, step=True)

if __name__ == "__main__":
# Note: The line `if __name__ == "__main__"` is necessary for milabench
# to recognize the entry point (it does some funky stuff to it).
main()
2 changes: 2 additions & 0 deletions benchmarks/fsbench/manifest
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!_dev_*
50 changes: 50 additions & 0 deletions benchmarks/fsbench/prepare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python
import os

from milabench.datasets.fake_images import generate_sets
from milabench.fs import XPath
import deeplake
import numpy as np


def make_deeplake_group(ds, folder, class_names):
group_name = os.path.basename(folder)

files_list = []
for dirpath, dirnames, filenames in os.walk(os.path.join(folder)):
for filename in filenames:
if filename == 'done':
continue
files_list.append(os.path.join(dirpath, filename))

with ds:
ds.create_group(group_name)
ds[group_name].create_tensor('images', htype='image', sample_compression='jpeg')
ds[group_name].create_tensor('labels', htype='class_label', class_names=class_names)
for f in files_list:
label_num = int(os.path.basename(os.path.dirname(f)))
ds[group_name].append({'images': deeplake.read(f), 'labels': np.uint16(label_num)})


# adjust the size of the generated dataset (1 = ~2Gb)
scale = 100
if __name__ == "__main__":
# If you need the whole configuration:
# config = json.loads(os.environ["MILABENCH_CONFIG"])

data_directory = os.environ["MILABENCH_DIR_DATA"]
dest = os.path.join(data_directory, "LargeFakeUniform")

generate_sets(dest, {"train": 14000 * scale, "val": 500 * scale, "test": 500 * scale}, (3, 512, 512))

root = dest + '.lake'
sentinel = XPath(root + '-done')
if sentinel.exists():
print(f"{root} was already generated")
else:
ds = deeplake.empty(dest + '.lake')
class_names = [str(i) for i in range(1000)]
make_deeplake_group(ds, os.path.join(dest, 'train'), class_names)
make_deeplake_group(ds, os.path.join(dest, 'val'), class_names)
make_deeplake_group(ds, os.path.join(dest, 'test'), class_names)
sentinel.touch()
4 changes: 4 additions & 0 deletions benchmarks/fsbench/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
torch
torchvision
tqdm
deeplake
27 changes: 27 additions & 0 deletions benchmarks/fsbench/voirfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Import this to instrument the ArgumentParser, remove if no such thing
from milabench.opt import instrument_argparse


def instrument_probes(ov):
# Probe for the necessary data. More information here:
# https://breuleux.github.io/milabench/instrument.html#probing-for-milabench

yield ov.phases.load_script

# loss
...

# batch + step
...

# use_cuda
...

# model
...

# loader
...

# batch + compute_start + compute_end
...