forked from tlkh/tf-metal-experiments
-
Notifications
You must be signed in to change notification settings - Fork 1
/
bw_benchmark.py
116 lines (93 loc) · 2.84 KB
/
bw_benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse

# Command-line interface of the benchmark.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--iterations",
    type=int,
    default=100,
    help="Number of iterations to run within each benchmark",
)
# The remaining flags are plain strings that only differ in name and default.
for _flag, _fallback in (
    ("--device1", "/GPU:0"),
    ("--device2", "/GPU:0"),
    ("--op", "add"),
):
    parser.add_argument(_flag, default=_fallback, type=str)
args = parser.parse_args()
import time
import tensorflow as tf
# Debugging toggles (uncomment to log op placement / hide the GPU):
#tf.debugging.set_log_device_placement(True)
#tf.config.set_visible_devices([], 'GPU')

# Side length of the square operand tensors.
# Set this to at least the number of execution units for optimal measurement;
# always use an integer multiple to avoid tile/wave quantization effects.
D = 8192

# Operation to benchmark, taken from the --op command-line flag.
op_type = args.op

input_shape = (D, D)
num_floats = D * D

# Floating-point operations performed per invocation of each op.
op_flops = {
    "add": num_floats,
    "mult": num_floats,
    "mult-add": 2 * num_floats,
}

# Bytes moved per invocation of each op (note: 32 bit = 4 bytes per element).
# - add / mult:
#     4B + 4B -> 4B
#     total: 8B read + 4B write = 12B per element
# - mult-add (counted as two element-wise passes, a*b and then +c):
#     2x (4B + 4B -> 4B)
#     total: 16B read + 8B write = 24B per element
op_memory = {
    "add": 12 * num_floats,
    "mult": 12 * num_floats,
    "mult-add": 24 * num_floats,
}

# Fail early with a readable message instead of a bare KeyError.
if op_type not in op_flops or op_type not in op_memory:
    raise SystemExit(
        "Unsupported --op %r; expected one of: %s"
        % (op_type, ", ".join(sorted(op_flops)))
    )

flops = op_flops[op_type]
# Estimated memory traffic of one invocation, in GB (1e9 bytes).
mem_GB = op_memory[op_type] / 1e9

print("")
print("Est. Memory:", round(mem_GB, 1), "GB")
print(" FLOPs:", flops)
print("")
# Create the three random float32 operands under the --device1 placement scope.
with tf.device(args.device1):
    a, b, c = (
        tf.random.normal(input_shape, dtype=tf.dtypes.float32)
        for _ in range(3)
    )
@tf.function(experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
def run_mult_add(a, b, c):
    """Return the element-wise multiply-add ``a * b + c``.

    The ``print`` is a Python-level side effect, so it reports when the
    function is being traced rather than each time the compiled graph runs.
    """
    print("Tracing run_mult_add")
    return a * b + c
@tf.function(experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
def run_add(a, b, c):
    """Return the element-wise sum ``a + b``.

    ``c`` is unused; it is accepted so that every benchmark kernel can be
    called with the same three operands. The ``print`` is a Python-level side
    effect, so it reports when the function is being traced.
    """
    print("Tracing run_add")
    return a + b
@tf.function(experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
def run_mult(a, b, c):
    """Return the element-wise product ``a * b``.

    ``c`` is unused; it is accepted so that every benchmark kernel can be
    called with the same three operands.
    """
    print("Tracing run_mult")
    return a * b


# Map every supported op name to its kernel. "mult" is listed in the FLOP and
# memory tables, so it needs a kernel here as well.
_kernels = {
    "add": run_add,
    "mult": run_mult,
    "mult-add": run_mult_add,
}

# Stop with a clear message rather than leaving `op` undefined, which would
# only surface later as a NameError at the first call.
if op_type not in _kernels:
    raise SystemExit(
        "Unsupported op type %r; expected one of: %s"
        % (op_type, ", ".join(sorted(_kernels)))
    )
op = _kernels[op_type]
# First call: run the selected kernel under the --device2 placement scope.
with tf.device(args.device2):
    C = op(a, b, c)

# Report where the inputs live and where the result ended up.
for _label, _tensor in (("\nStarting device:", a), ("Ending device:", C)):
    print(_label, _tensor.device, _tensor.dtype)

# Converting to NumPy materializes the result on the host.
C.numpy()

# Second warm-up call before any timing starts.
C = op(a, b, c)
C.numpy()
# Measure overhead: the average time of a single kernel call followed by
# reading the result back as a NumPy array.
# perf_counter() is used because it is monotonic and high resolution, whereas
# time.time() follows the wall clock and can jump while timing.
overhead_reps = 30
st = time.perf_counter()
for _ in range(overhead_reps):
    C = op(a, b, c)
    C.numpy()
et = time.perf_counter()
overhead = (et - st) / overhead_reps

print("\nOverhead:", overhead)
print("")
# Main measurement loop: runs until interrupted with Ctrl-C and prints one
# throughput sample per pass.
try:
    while True:
        # perf_counter() is monotonic, so samples cannot go negative because
        # of wall-clock adjustments.
        st = time.perf_counter()
        for _ in range(args.iterations):
            C = op(a, b, c)
        # Read the last result back once so the timed region ends only after
        # the value is available on the host.
        C.numpy()
        et = time.perf_counter()

        # NOTE(review): the overhead estimate computed earlier in this script
        # also contains one kernel execution, so throughput may be overstated
        # by roughly 1/iterations - confirm whether that is intended.
        duration = et - st - overhead
        if duration <= 0:
            # Happens when --iterations is so small that the pass is not
            # longer than the overhead estimate; dividing would give a
            # ZeroDivisionError or a meaningless negative rate.
            print("Sample skipped: elapsed time did not exceed the overhead "
                  "estimate; increase --iterations")
            continue

        fps = args.iterations / duration
        gflops = fps * flops / 1e9
        bw = fps * mem_GB
        print("it/sec:", round(fps,1), "GFLOPS:", round(gflops,1), "est.BW:", round(bw,1), "GB/s")
except KeyboardInterrupt:
    # Ctrl-C is the intended way to stop the benchmark.
    pass