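# Example commands for benchmarking models with the eval harness (python -m eval.run).
# Each section exports the API backend and model, fills in the matching API key, and
# runs the eval over the nikoli_100 and challenge_100 datasets, writing results to
# ../data/benchmark_results/<dataset>/<model>.csv.
# Replace the `...` placeholders with your own API keys before running a section.
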
# OpenAI - o3-mini-2025-01-31
export API=openai
export MODEL=o3-mini-2025-01-31
export OPENAI_API_KEY=...
for DATA_SOURCE in nikoli_100 challenge_100; do
  python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --batch_size 20
done

# OpenAI - gpt-4o-2024-11-20
export API=openai
export MODEL=gpt-4o-2024-11-20
export OPENAI_API_KEY=...
for DATA_SOURCE in nikoli_100 challenge_100; do
  python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --batch_size 20
done

# OpenAI - gpt-4o-mini-2024-07-18
export API=openai
export MODEL=gpt-4o-mini-2024-07-18
export OPENAI_API_KEY=...
for DATA_SOURCE in nikoli_100 challenge_100; do
  python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --batch_size 20
done

# Anthropic - claude-3-7-sonnet-20250219
export API=anthropic
export MODEL=claude-3-7-sonnet-20250219
export ANTHROPIC_API_KEY=...
for DATA_SOURCE in nikoli_100 challenge_100; do
  python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --batch_size 20
done

# Anthropic - claude-3-5-haiku-20241022
export API=anthropic
export MODEL=claude-3-5-haiku-20241022
export ANTHROPIC_API_KEY=...
for DATA_SOURCE in nikoli_100 challenge_100; do
  python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --batch_size 20
done

# Together AI - deepseek-ai/DeepSeek-V3
export API=togetherai
export MODEL=deepseek-ai/DeepSeek-V3
export TOGETHERAI_API_KEY=...
for DATA_SOURCE in nikoli_100 challenge_100; do
  python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --batch_size 20
done
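
# The sections below run open-weight models locally through vLLM on four GPUs
# (CUDA_VISIBLE_DEVICES=0,1,2,3 with --tensor_parallel_size 4). Based on their names,
# --n_response_idxs appears to request multiple sampled responses per input, and
# --temperature / --max_tokens set the sampling temperature and response length cap;
# check the eval.run CLI for the exact semantics.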

# vLLM - Qwen/QwQ-32B-AWQ
export API=vllm
export MODEL=Qwen/QwQ-32B-AWQ
for DATA_SOURCE in nikoli_100 challenge_100; do
  CUDA_VISIBLE_DEVICES=0,1,2,3 python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --tensor_parallel_size 4 \
    --batch_size 1024 \
    --temperature 0.6 \
    --max_tokens 20000
done

# vLLM - Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4
export API=vllm
export MODEL=Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4
for DATA_SOURCE in nikoli_100 challenge_100; do
  CUDA_VISIBLE_DEVICES=0,1,2,3 python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --tensor_parallel_size 4 \
    --n_response_idxs 0 1 2 \
    --batch_size 1024
done

# vLLM - meta-llama/Llama-3.3-70B-Instruct
export API=vllm
export MODEL=meta-llama/Llama-3.3-70B-Instruct
for DATA_SOURCE in nikoli_100 challenge_100; do
  CUDA_VISIBLE_DEVICES=0,1,2,3 python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --tensor_parallel_size 4 \
    --n_response_idxs 0 1 2 \
    --batch_size 1024
done

# vLLM - RekaAI/reka-flash-3
export API=vllm
export MODEL=RekaAI/reka-flash-3
for DATA_SOURCE in nikoli_100 challenge_100; do
  CUDA_VISIBLE_DEVICES=0,1,2,3 python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --tensor_parallel_size 4 \
    --n_response_idxs 0 1 2 \
    --batch_size 1024
done

# vLLM - google/gemma-3-27b-it
export API=vllm
export MODEL=google/gemma-3-27b-it
for DATA_SOURCE in nikoli_100 challenge_100; do
  CUDA_VISIBLE_DEVICES=0,1,2,3 python -m eval.run \
    --dataset ${DATA_SOURCE} \
    --output_csv ../data/benchmark_results/${DATA_SOURCE}/${MODEL}.csv \
    --api ${API} \
    --model ${MODEL} \
    --tensor_parallel_size 4 \
    --n_response_idxs 0 1 2 \
    --batch_size 1024
done