# dataloader_bench.py
# Forked from NVIDIA-Merlin/NVTabular.
# (Web-scrape residue — page header and gutter line numbers — removed; the
# script proper begins below.)
# Benchmark: measure NVTabular torch dataloader throughput across batch sizes.
import argparse
import logging
import os
import sys
import time
# Make the repository root importable so `nvtabular` resolves when this
# script is run from its own directory.
sys.path.insert(1, "../")
def parse_args(argv=None):
    """Parse the benchmark's command-line arguments.

    Args:
        argv: Optional list of argument strings; defaults to ``sys.argv[1:]``
            (passing a list makes the function testable without touching
            ``sys.argv``).

    Returns:
        argparse.Namespace with string attributes ``gpu_id``, ``in_dir``,
        ``in_file_type`` and ``gpu_mem_frac``.
    """
    # The previous description ("Process some integers.") was the argparse
    # documentation example text, left in by copy-paste; describe the tool.
    parser = argparse.ArgumentParser(description="NVTabular dataloader throughput benchmark.")
    parser.add_argument("gpu_id", help="gpu index to use")
    parser.add_argument("in_dir", help="directory with dataset files inside")
    parser.add_argument("in_file_type", help="type of file (i.e. parquet, csv, orc)")
    parser.add_argument(
        "gpu_mem_frac", help="the amount of gpu memory to use for dataloader in fraction"
    )
    return parser.parse_args(argv)
args = parse_args()
print(args)
GPU_id = args.gpu_id
# Pin CUDA to the requested device. This must happen BEFORE importing
# nvtabular / its GPU-backed dependencies, which initialize CUDA on import —
# hence the mid-file imports below.
os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU_id)
from nvtabular.torch_dataloader import AsyncTensorBatchDatasetItr
import nvtabular as nvt
logging.basicConfig()
logging.getLogger("nvtabular").setLevel(logging.DEBUG)
data_path = args.in_dir
# NOTE(review): only "*parquet" files are collected and engine="parquet" is
# hard-coded below, so the `in_file_type` CLI argument is currently ignored.
train_paths = [os.path.join(data_path, x) for x in os.listdir(data_path) if x.endswith("parquet")]
print(train_paths)
# import pdb; pdb.set_trace()
train_set = nvt.Dataset(train_paths, engine="parquet", part_mem_fraction=float(args.gpu_mem_frac))
# Criteo-style column layout (presumably — confirm against the dataset):
# 13 continuous columns I01..I13, 23 categorical columns C01..C23.
cont_names = ["I" + str(x).zfill(2) for x in range(1, 14)]
cat_names = ["C" + str(x).zfill(2) for x in range(1, 24)]
# NOTE(review): `cols` is built but never used anywhere below.
cols = ["label"] + cont_names + cat_names
# batch_size -> measured throughput (samples/second)
results = {}
# Sweep power-of-two batch sizes from 2**9 (512) to 2**24.
for batch_size in [2 ** exp for exp in range(9, 25, 1)]:
    print("Checking batch size: ", batch_size)
    # Target ~1e7 samples per configuration, but always at least 100 batches.
    num_iter = max(10 * 1000 * 1000 // batch_size, 100)
    # import pdb; pdb.set_trace()
    t_batch_sets = AsyncTensorBatchDatasetItr(
        train_set, cats=cat_names, conts=cont_names, labels=["label"], batch_size=batch_size,
    )
    # Count batches explicitly. The previous code reused the enumerate index
    # after the loop, which under-counted by one batch whenever the iterator
    # was exhausted before num_iter, and raised NameError if it yielded
    # nothing at all.
    n_batches = 0
    start = time.time()
    for data in t_batch_sets:
        if n_batches >= num_iter:
            break
        n_batches += 1
        # Drop the batch immediately so GPU memory is freed between fetches.
        del data
    stop = time.time()
    elapsed = stop - start
    throughput = n_batches * batch_size / elapsed
    results[batch_size] = throughput
    print(
        "batch size: ",
        batch_size,
        ", throughput: ",
        throughput,
        "items",
        n_batches * batch_size,
        "time",
        elapsed,
    )