run_isoflops.py
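
"""Launch an isoFLOPs sweep over compute budgets and model sizes.

For each FLOP budget, the token count for every model configuration is set by
the C ~ 6*N*D approximation, and each run of char_scaling_laws.py is dispatched
as a subprocess pinned to a CUDA device, with concurrency capped by a semaphore.
"""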
import os
import subprocess
import sys
import threading

import matplotlib.pyplot as plt

from char_scaling_laws import SmallTransformer

# Hardware / concurrency settings.
NUM_CUDA_DEVICES = 2
MAX_CONCURRENT_PROCESSES = 2

def count_params(d_model: int, n_heads: int, n_layers: int) -> int:
    """Instantiate SmallTransformer once and return its total parameter count."""
    model = SmallTransformer(vocab_size=174, d_model=d_model, n_heads=n_heads, num_layers=n_layers)
    return sum(p.numel() for p in model.parameters())

def flops_to_tokens(flops: float, param_count: int):
    """Token count D implied by compute budget C and parameter count N via C = 6*N*D."""
    num_tokens = flops / (6 * param_count)
    if num_tokens > 1_900_000_000:  # would exceed the ~1.9B-token dataset
        return None
    return num_tokens
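# Example: with C = 1e15 FLOPs and N = 1e7 parameters,
# D = C / (6 * N) ~ 1.67e7 tokens, comfortably under the dataset cap.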

# Allow at most MAX_CONCURRENT_PROCESSES training subprocesses at a time.
semaphore = threading.Semaphore(MAX_CONCURRENT_PROCESSES)

def run_subprocess(command, env):
    """Run one training command under the semaphore and report its outcome."""
with semaphore:
try:
process = subprocess.Popen(
command,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
if process.returncode != 0:
print(f"Error running command: {' '.join(command)}", file=sys.stderr)
print(f"stderr: {stderr.decode()}", file=sys.stderr)
else:
print(f"Successfully executed: {' '.join(command)}")
except Exception as e:
print(f"Exception occurred while running command: {' '.join(command)}", file=sys.stderr)
print(str(e), file=sys.stderr)

flop_counts = [
1e15, # 1 PetaFLOP
3e15,
6e15,
1e16,
3e16, # 30 PetaFLOPs
]
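# Budgets range from 1 to 30 PFLOPs; each one yields an isoFLOP profile
# over the model-size grid below.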

model_sizes = [
    (d, max(1, d // 64), max(d // 64, 2)) for d in range(256, 64 * 17, 64)
]
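# The comprehension above yields 13 configs: d_model = 256, 320, ..., 1024
# (step 64) with head dim fixed at 64, so n_heads = n_layers = d_model // 64
# (4 through 16).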
print("Model sizes:")
import matplotlib.pyplot as plt
xs = []
for model_size in model_sizes:
d_model, n_heads, n_layers = model_size
print(f"d_model={d_model}, n_heads={n_heads}, num_layers={n_layers}")
params = count_params(d_model, n_heads, n_layers)
xs.append(params)
plt.scatter(xs, [1 for _ in range(len(xs))])
plt.xscale("log")
plt.savefig("params.png")
plt.close("all")

count = 0
threads = []
batch_size = 128
seq_len = 128
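# Each optimizer step consumes batch_size * seq_len = 16,384 tokens, so the
# iteration count below is tokens / 16,384.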

for flop_count in flop_counts:
print("=" * 30)
for d_model, n_heads, n_layers in model_sizes:
        params = count_params(d_model, n_heads, n_layers)
        tokens = flops_to_tokens(flop_count, params)
if tokens is None:
print(f"Skipping (tokens={tokens}) for flop_count={flop_count}, d_model={d_model}, n_heads={n_heads}, n_layers={n_layers}")
continue
train_iters = tokens / (batch_size * seq_len)
print(f"train_iters={train_iters}")
if train_iters < 100:
print(f"Skipping (train_iters={train_iters}) for flop_count={flop_count}, d_model={d_model}, n_heads={n_heads}, n_layers={n_layers}")
continue
        petaflops = flop_count / 1e15
        print(f"petaflops={petaflops} | params={params} (d_model={d_model}, n_heads={n_heads}, n_layers={n_layers}) | tokens={tokens:.2e}")
        # Round-robin assignment of runs across the available GPUs.
        cuda_device = count % NUM_CUDA_DEVICES
        # Pin the subprocess to that GPU via its environment.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = str(cuda_device)
command = [
sys.executable, # Ensures the same Python interpreter is used
"char_scaling_laws.py",
"--experiment_name", f"flops{petaflops}_d{d_model}_l{n_layers}_h{n_heads}_tokens{int(tokens)}_params{params}",
"--num_train_tokens", str(int(tokens)),
"--n_layers", str(n_layers),
"--d_model", str(d_model),
"--n_heads", str(n_heads),
"--batch_size", str(batch_size),
"--seq_len", str(seq_len),
]
print("Executing command:", " ".join(command))
        t = threading.Thread(target=run_subprocess, args=(command, env))
        t.start()
        threads.append(t)
        count += 1
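
# Wait for every launched run to finish; the threads are non-daemon, so the
# interpreter would block on them anyway, but an explicit join makes
# completion unambiguous.
for t in threads:
    t.join()
print(f"All {count} runs completed.")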