forked from tlkh/tf-metal-experiments
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtflops_sweep.py
105 lines (88 loc) · 2.9 KB
/
tflops_sweep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import argparse
# Command-line interface: a single option controlling how many timed
# repetitions each matrix size gets.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--iterations",
    type=int,
    default=5,
    help="Number of iterations to run within each benchmark",
)
args = parser.parse_args()
import os
import time
import platform
import subprocess
from tqdm import tqdm
import tensorflow as tf
def get_mac_processor_name():
    """Return the macOS CPU brand string with spaces replaced by underscores.

    Runs ``sysctl -n machdep.cpu.brand_string`` and returns its trimmed
    output with every space turned into ``_``.

    Returns:
        The processed brand string, or ``"unknown_processor"`` when
        ``sysctl`` cannot be launched, its output cannot be decoded, it
        exits with a non-zero status, or it prints nothing.
    """
    fallback = "unknown_processor"
    try:
        result = subprocess.run(
            ["sysctl", "-n", "machdep.cpu.brand_string"],
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # Best effort: a missing sysctl binary or undecodable output must not
        # abort the caller, so report it and use the placeholder.
        print(f"Error getting processor name: {e}")
        return fallback

    if result.returncode != 0:
        return fallback
    # An empty brand string would yield an empty name; use the placeholder.
    return result.stdout.strip().replace(" ", "_") or fallback
# Determine the processor name based on the platform. The value is later
# embedded in the output filename, so it should never be empty.
if platform.system() == 'Darwin':
    processor_name = get_mac_processor_name()
else:
    # platform.processor() returns "" when the value cannot be determined;
    # fall back to the machine type, then to a fixed placeholder.
    processor_name = (
        platform.processor() or platform.machine() or "unknown_processor"
    ).replace(" ", "_")
@tf.function(experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
def do_op(a, b, num_matmul=10):
    """Chain ``num_matmul`` matrix products, left-multiplying by ``a`` each time.

    The first product is ``matmul(a, b)``; every further step replaces the
    running result ``r`` with ``matmul(a, r)``.

    Args:
        a: Left operand of every product.
        b: Right operand of the first product.
        num_matmul: Total number of matmul calls. Values below 2 still
            issue the initial product.

    Returns:
        The result of the last product.
    """
    # Plain Python print; presumably intended to appear only while
    # tf.function traces this body -- confirm against the TF docs.
    print("Tracing")
    product = tf.linalg.matmul(a, b)
    # Steps 2..num_matmul: the first product was already computed above.
    for _ in range(1, num_matmul):
        product = tf.linalg.matmul(a, product)
    return product
def benchmark_matmul(M, dtype=tf.float32, num_matmul=100, iterations=1):
    """Time chained M x M matmuls and return the achieved matmuls per second.

    Args:
        M: Side length of the square random operands.
        dtype: Element type of the generated operands.
        num_matmul: Number of matmuls issued by each ``do_op`` call.
        iterations: Number of timed ``do_op`` calls.

    Returns:
        ``num_matmul * iterations`` divided by the elapsed seconds of the
        timed section.
    """
    # generate data
    with tf.device("/GPU:0"):
        A = tf.random.normal([M, M], mean=0, stddev=1, dtype=dtype)
        B = tf.random.normal([M, M], mean=0, stddev=1, dtype=dtype)
    # warm-up: two untimed calls, each followed by a host read-back,
    # then a one-second pause before timing starts
    C = do_op(A, B, num_matmul=num_matmul)
    C.numpy()
    C = do_op(A, B, num_matmul=num_matmul)
    C.numpy()
    time.sleep(1)
    # run benchmark
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments.
    st = time.perf_counter()
    for _ in range(iterations):
        C = do_op(A, B, num_matmul=num_matmul)
    # NOTE(review): single read-back after the loop; presumably it waits
    # for all queued work to finish before the clock stops -- confirm.
    C.numpy()
    et = time.perf_counter()
    duration = et - st
    return num_matmul * iterations / duration
# Achieved FP32 TFLOPS, one entry per size in M_list (same order).
fp32_tflops = []
M_list = [32, 64, 128, 256, 512, 1024, 1536, 2048, 4096, 6144, 8192]

print("\nStarting burn...\n")
burn_start = time.time()

for M in tqdm(M_list):
    # Flush so the size label shows up while the benchmark is still running;
    # otherwise the partial line stays buffered until the result is printed.
    print("FP32", M, end=" : ", flush=True)
    fps = benchmark_matmul(M, dtype=tf.float32, iterations=args.iterations)
    # fps is matmuls per second; each M x M matmul is counted as 2*M^3
    # floating-point operations.
    tflops = fps * 2 * M**3 / 1e12
    fp32_tflops.append(tflops)
    print(tflops)

burn_end = time.time()
print("\nFinished in", int(burn_end-burn_start), "seconds\n")
# Pick the best result together with the matrix size that produced it
# (the first one wins on ties).
max_tflop_M, max_tflop = max(
    zip(M_list, fp32_tflops), key=lambda pair: pair[1]
)

# Section 1: every measured value, under an underlined heading.
title = "All TFLOPS values"
print("", title, "=" * len(title), fp32_tflops, "", sep="\n")

# Section 2: the peak value. `title` keeps this heading afterwards.
title = "Max TFLOPS achieved"
print("", title, "=" * len(title), sep="\n")
print("* FP32:", round(max_tflop, 1), "TFLOPS")
print("")
from matplotlib import pyplot as plt
# Plot achieved TFLOPS against matrix size and mark the size that reached
# the maximum. plt.figure() creates a fresh, empty figure, so nothing needs
# clearing beforehand.
plt.figure(figsize=(10,6), dpi=100)
# `title` still holds the last heading printed in the summary.
plt.title(title)
plt.plot(M_list, fp32_tflops, label="FP32", color="b")
plt.axvline(max_tflop_M, color="k", linestyle="--", linewidth=1, label="M="+str(max_tflop_M))
plt.xlabel("Matrix size M*M")
plt.ylabel("Achieved TFLOPS")
plt.legend()
# The output filename carries the lower-cased processor name.
plt.savefig(f"gpu_tflops_plot_{processor_name.lower()}.jpg")