<!doctype html>
<html>
<head>
<meta charset="utf-8">
<!-- <title>Redirecting to https://bob.github.io/repo/</title> -->
<meta http-equiv="refresh" content="0; URL=http://for.ai/compute-thresholds.github.io/">
<link rel="canonical" href="http://for.ai/compute-thresholds.github.io/">
<title>On the Limitations of Compute Thresholds as a Governance Strategy</title>
<!-- Twitter Card data -->
<meta name="twitter:card" value="summary">
<meta name="twitter:title" content="On the Limitations of Compute Thresholds as a Governance Strategy">
<!-- <meta name="twitter:description" content="What do pruned deep neural networks forget?"> -->
<meta name="twitter:url" content="http://for.ai/compute-thresholds/">
<meta name="twitter:image" content="https://cdn.glitch.com/02868eea-fe84-443e-964a-8f04885fa5fa%2Faccuracy_distribution_updated.png?v=1574118491306">
<meta name="twitter:site" content="@CohereForAI" />
<meta property="og:image:width" content="1920" />
<meta property="og:image:height" content="1080" />
<meta property="og:title" content="On the Limitations of Compute Thresholds as a Governance Strategy" />
<meta property="og:type" content="article" />
<!-- <meta property="og:description" content="What do pruned deep neural networks forget?" /> -->
<meta property="og:image" content="https://cdn.glitch.com/02868eea-fe84-443e-964a-8f04885fa5fa%2Faccuracy_distribution.png?v=1574118354833" />
<meta property="og:url" content="http://for.ai/compute-thresholds.github.io//" />
<!-- <meta property="og:site_name" content="Deep Neural Network Pruning"> -->
<meta property="og:locale" content="en_US">
<!-- https://scholar.google.com/intl/en/scholar/inclusion.html#indexing -->
<meta name="citation_title" content="On the Limitations of Compute Thresholds as a Governance Strategy: Measuring the Disparate Impact of Model Pruning">
<meta name="citation_fulltext_html_url" content="http://for.ai/compute-thresholds.github.io//">
<!-- Update paper link -->
<meta name="citation_pdf_url" content="https://arxiv.org/abs/1911.05248">
<meta name="citation_fulltext_world_readable" content="">
<meta name="citation_author" content="Hooker, Sara">
<meta name="citation_author_institution" content="Cohere For AI">
<!-- Update publication date -->
<meta name="citation_publication_date" content="2024/08/13">
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-152824096-1"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'UA-152824096-1');
</script>
<!-- https://schema.org/Article -->
<meta property="description" itemprop="description" content="On the Limitations of Compute Thresholds as a Governance Strategy.">
<meta property="article:author" content="Sara Hooker">
<meta property="article:url" content="http://for.ai/compute-thresholds.github.io//" />
<link href="https://fonts.googleapis.com/css?family=Roboto:300,400" rel="stylesheet">
<link rel="stylesheet" href="https://code.getmdl.io/1.3.0/material.indigo-pink.min.css">
<style>
body {
font-family: "Roboto", "Helvetica", sans-serif;
margin: 0;
padding: 0;
display: flex;
flex-direction: column;
font-size: 12px;
}
html {
margin: 0;
padding: 0;
height: 100%;
}
table td {
font-size: 12px;
text-align: center;
outline: 1px solid white;
padding: 0;
margin: 0;
}
table.inner td {
padding: 0;
margin: 0;
border: 0;
width: 25%;
}
.footer-row {
height: 15px;
}
table.inner tr {
border: 0;
}
table.inner th {
padding: 8px;
}
table th {
font-size: 11px;
}
table {
border-collapse: collapse;
border-spacing: 0;
}
thead, tbody { display: block; }
.rotated {
transform: rotate(90deg);
transform-origin: left bottom 0;
margin-top: -111px;
font-weight: bold;
font-size: 1.2em;
padding: 8px;
}
#headers {
z-index: 1000;
background-color: white;
height: 65px;
vertical-align: middle;
border-bottom: 1px solid #ccc;
margin-bottom: 10px;
}
#headers span {
background-color: white;
display: inline-block;
line-height: 65px;
font-size: 1.2em;
font-weight: bold;
text-align: center;
text-overflow: ellipsis;
white-space: nowrap;
}
.cover {
background: #1e283a;
}
.cover-container {
padding-top: 10px;
padding-bottom: 60px;
}
.descriptions_, .description_ {
padding-top: 20px;
}
.cover-container, .descriptions_, .description_ {
padding-right: 5px;
padding-left: 5px;
margin-right: auto;
margin-left: auto;
}
@media (min-width: 415px) {
authors .authors-affiliations,
.base-grid, .imgs-container
.cover-container, .descriptions_, .description_, .column_portfolio, .column_portfolio_, .column_portfolio, .column_portfoliofinal {
width: 500px;
}
.column_portfolio_ .column_portfolio_final .column_portfolio figcaption, .column_portfolio_, .column_portfolio, .column_portfoliofinal {
padding: 0;
padding-top: 4px;
word-wrap: break-word;
word-break: break-word;
}
}
@media (min-width: 768px) {
authors .authors-affiliations, .imgs-container,
.cover-container, .descriptions_, .description_, .column_portfolio, .column_portfolio_ .column_portfolio .column_portfoliofinal {
width: 650px;
}
}
@media (min-width: 992px) {
authors .authors-affiliations, .imgs-container,
.cover-container, .descriptions_, .description_, .column_portfolio, .column_portfolio_, .column_portfolio, .column_portfoliofinal {
width: 770px;
}
}
@media (min-width: 1200px) {
authors .authors-affiliations, .imgs-container,
.cover-container, .descriptions_, .description_, .column_portfolio, .column_portfolio_, .column_portfolio, .column_portfoliofinal {
width: 970px;
}
}
.cover h1 {
font-family: "Roboto", "Gotham A", "Gotham B";
letter-spacing: 0.05em;
font-size: 63px;
font-weight: 700;
margin-bottom: 0.5em;
text-transform: uppercase;
}
.cover h3 {
font-size: 30px;
letter-spacing: 0.05em;
font-weight: 500;
}
.descriptions_ h3 {
color: #313b4e;
opacity: .8;
}
.descriptions_ p {
color: #313b4e;
opacity: .8;
font-size: 16px;
}
.cover {
color: #ddd;
}
.authors {
margin-top: -40px;
overflow: hidden;
border-top: 1px solid rgba(0, 0, 0, 0.1);
font-size: 1.5rem;
line-height: 1.8em;
padding: 1.5rem 0;
min-height: 1.8em;
}
.subtitle {
margin-top: -20px;
}
.icons {
margin-top: 30px;
padding-left: 4px;
}
.icons a {
display: inline-block;
font-size: 16px;
color: #ccc;
text-decoration: none;
}
.paper-icon {
display: inline-block;
}
.paper-icon a {
line-height: 35px;
vertical-align: top;
}
.paper-icon:hover a {
cursor: pointer;
text-decoration: underline;
}
.description_ p {
width: 100%;
font-size: 16px;
}
.description_ img {
vertical-align: middle;
width: 100%;
}
.imgs-container {
display: table-row;
}
.img-container {
color: #62779c;
text-align: center;
font-weight: bold;
font-size: 14px;
padding-right: 6px;
display: table-cell;
width: 33%;
}
#headers.fixed-header {
position: fixed;
top: 0;
}
#table-container.fixed-header {
margin-top: 106px;
}
.image-label {
font-size: 15px;
text-align: left;
padding-bottom: 4px;
padding-top: 6px;
padding-left: 2px;
font-weight: normal;
}
.img-times-selector-container {
margin-left: -80px;
margin-top: -45px;
font-size: 18px;
font-weight: bold;
text-align: center;
}
.img-times-selector {
width: 175px;
}
#table {
margin-top: 0px;
width: 100%;
}
* {
box-sizing: border-box;
}
/* Center website */
.row {
margin: 8px -16px;
}
/* Add padding BETWEEN each column (if you want) */
.row,
.row > .column_portfolio {
padding: 3px;
}
/* Create three equal columns that floats next to each other */
.column_portfolio {
float: left;
width: 33.33%;
display: none; /* Hide columns by default */
}
.column_portfolio_ .column_portfolio .column_portfoliofinal figcaption {
padding: 4px 8px;
word-wrap: break-all;
word-break: break-all;
}
/* Create three equal columns that floats next to each other */
.column_portfolio_ {
float: left;
width: 25.00%;
display: none; /* Hide columns by default */
}
.column_portfoliofinal {
float: left;
width: 100.00%;
display: none; /* Hide columns by default */
}
.column_portfoliofinalfinal {
float: left;
width: 25.00%;
display: none; /* Hide columns by default */
}
.column_header {
float: left;
width: 100.00%;
display: none; /* Hide columns by default */
}
.column_header_ {
float: left;
width: 100.00%;
display: none; /* Hide columns by default */
}
.column_headerfinal {
float: left;
width: 100.00%;
display: none; /* Hide columns by default */
}
.column_headerfinalfinal {
float: left;
width: 100.00%;
display: none; /* Hide columns by default */
}
.column_two_fig {
float: left;
width: 50.00%;
display: none; /* Hide columns by default */
}
/* Clear floats after rows */
.row:after {
content: "";
display: table;
clear: both;
}
/* Content */
.content {
background-color: white;
padding: 10px;
width: 80%;
margin-left: auto;
margin-right: auto;
}
.content_reduced {
background-color: white;
padding: 10px;
width: 70%;
margin-left: auto;
margin-right: auto;
}
.content_reduced_slightly {
background-color: white;
padding: 2px;
width: 50%;
margin-left: auto;
margin-right: auto;
}
.content_resized {
background-color: white;
padding: 2px;
width: 50%;
margin-left: auto;
margin-right: auto;
}
/* The "show" class is added to the filtered elements */
.show {
display: block;
}
/* Style the buttons */
.btn {
border: none;
border-radius: 4px;
outline: none;
padding: 12px 16px;
font-size: 14px;
background-color:#599bb3;
color:#ffffff;
background:linear-gradient(to bottom, #599bb3 5%, #408c99 100%);
text-shadow:0px 1px 0px #3d768a;
margin-right: auto;
margin-left: auto;
margin-bottom:5px;
cursor:pointer;
}
/* Add a grey background color on mouse-over */
.btn:hover {
background-color: #ddd;
}
/* Add a dark background color to the active button */
.btn.active_1, .btn.active_2, .btn.active_3, .btn.active_4, .btn:target
{ background:linear-gradient(to right, #666 3%, #666 100%);
color: white;
cursor:none;
}
figcaption,
.figcaption {
color: rgba(0, 0, 0, 0.6);
font-size: 14px;
font-weight: bold;
line-height: 1.5em;
}
figcaption a {
color: rgba(0, 0, 0, 0.6);
}
figcaption b,
figcaption .strong_, {
font-weight: bold;
font-size: 14px;
color: #180A3E;
}
</style>
</head>
<body>
<div id="scroll-container">
<div class="cover">
<div class="cover-container">
<div class="icons">
<div class="paper-icon">
<!-- Update Paper Link -->
<a href="https://arxiv.org/abs/1911.05248">
<img src="https://cdn.glitch.com/a08d19a0-dea5-4f06-9627-caa859e2d931%2Fpaper_icon.png?v=1572561063939" style="width: 100px"/><br>Paper
</a>
</div>
<!-- <div class="paper-icon" style="margin-left: 20px">
<a href="https://github.com/google-research/google-research/tree/master/pruning_identified_exemplars">
<img src="https://cdn.glitch.com/a08d19a0-dea5-4f06-9627-caa859e2d931%2Fcode_icon.png?v=1572562103868" style="width: 100px"/><br>Code
</a>
</div> -->
</div>
<div class="title"><h2>On the Limitations of Compute Thresholds as a Governance Strategy</h2></div>
<div class="authors">Sara Hooker</div>
<div class="institutions"></div>
</div>
</div>
<div class="descriptions_">
<h3>The Uncertain Relationship Between Compute and Risk</h3>
</div>
<div class="description_">
<p> Many inventions are re-purposed for means unintended by their designers. Initially, the magnetron tube
was developed for radar technology during World War II. In 1945, a self-taught American engineer, Percy Spencer,
noticed that a chocolate bar melted in his pocket whenever he was close to a radar set. This innocuous discovery
resulted in the patent for the first microwave \citep{inbook}. In a similar vein, deep neural networks only began
to work when an existing technology was unexpectedly re-purposed. A graphical processing unit (GPU) was originally
introduced in the 1970s as a specialized accelerator for video games and for developing graphics for movies and
animation. In the 2000s, like the magnetron tube, GPUs were re-purposed for an entirely unimagined use
case – to train deep neural networks \citep{Chellapilla2006,hooker2021,OH20041311kyoung,Payne2005}.
GPUs had one critical advantage over CPUs – they were far better at parallelizing matrix
multiplies \citep{BRODTKORB20134,DettmersGPU}, the mathematical operation which dominates the definition of deep
neural network layers \citep{fawzi2022discovering,davies2024}. This higher number of floating point operations
per second (FLOP/s), combined with the clever distribution of training between GPUs, unblocked the training of
deeper networks. The depth of the network turned out to be critical. Performance on ImageNet jumped with ever
deeper networks in 2011 \citep{inproceedings2011}, 2012 \citep{Krizhevsky2012} and
2015 \citep{szegedy2014going}. A striking example of this jump in compute is a comparison of the now famous
2012 Google paper which used 16,000 CPU cores to classify cats \citep{le2012building} to a paper published a
mere year later that solved the same task with only two CPU cores and four GPUs \citep{coates13}. </p>
<p> This would ignite a rush for compute which has led to a bigger-is-better race in the number of model parameters
over the last decade \citep{2016Canziani,strubell2019energy,rae2021scaling,raffel2020exploring,bommasani2021opportunities,bender_gebru_2021}.
The computer scientist Ken Thompson famously said \textit{``When in doubt, use brute force.''}
This was formalized as the “bitter lesson” by Rich Sutton who posited that computer science history tells us that
throwing more compute at a problem has consistently outperformed all attempts to leverage human knowledge of a domain
to teach a model \citep{SilverBittrLesson}. In a punch to the ego of every computer scientist out there, what Sutton is
saying is that symbolic methods that codify human knowledge have not worked as well as letting a model learn patterns
for itself coupled with ever-vaster amounts of compute. </p>
<p> <b>Is Sutton right?</b> Certainly, he is correct that scaling has been a widely favored formula because
it has provided persuasive gains in overall performance – size is the most de-risked tool we have to unlock
new gains. As the computer scientist Michael Jordan quipped \textit{``Today we can’t think without holding a
piece of metal.''} Increasing compute also conveniently fits into the cadence of quarterly industry
planning; it is less risky to propose training a bigger model than it is to propose an alternative
optimization technique. However, relying on compute alone misses a critical shift that is underway in the
relationship between compute and performance. It is not always the case that bigger models result in better
performance. The bitter lesson doesn't explain why Falcon 180B \citep{almazrouei2023falconseriesopenlanguage} is
easily outperformed by far smaller open weights models such as Llama-3 8B \citep{llama3modelcard},
Command R 35B \citep{cohere_c4ai_command_r_plus}, and Gemma 27B \citep{gemma_2024}. It also doesn't explain why
Aya 23 8B \citep{aryabumi2024aya} easily outperforms BLOOM 176B \citep{workshop2023bloom176bparameteropenaccessmultilingual}
despite having only 4.5\% of the parameters. </p>
<!-- Add Figure 3 from Paper here -->
<!-- <div class="content_reduced_slightly">
<img src="https://cdn.glitch.com/f1ebd1ee-d1ac-4538-8ad5-0034e332e4ae%2Fsynaptic_pruning_image.png?v=1574277111414" alt="abstract_1" style="width:100%">
<div class="figcaption">
<strong_>Synaptic pruning removes redundant neurons and strengthens connections that are most useful for the environment. (Figure courtesy of Seeman, 1999)</strong_><br>
</div>
</div> -->
<p> These are not isolated examples, but rather indicative of an overall trend where there is no guarantee
larger models consistently outperform smaller models. Figure \ref{fig:above_13_b} plots the scores of models
submitted to the \href{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}{Open LLM Leaderboard}
over the last two years. Here, we plot \textit{large models} with more than 13 billion parameters whose leaderboard
score is less than the top performing \textit{small model} with less than 13 billion parameters.
We observe that over time, more and more large models have been submitted that are outperformed by the best
small model daily submission. To understand why this is the case, we must understand what key variables have been
driving gains in performance over the last decade. In an era where there are diminishing returns for the amount
of compute available \citep{lohn2022ai,2020Thompson}, optimization and architecture breakthroughs define the rate
of return for a given unit of compute. \textbf{It is this rate of return which is most critical to the pace of
progress and to the level of risk incurred by additional compute}.</p>
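<p> As a rough illustration of the analysis behind Figure \ref{fig:above_13_b}, the sketch below (in Python, assuming a hypothetical table of leaderboard submissions with <i>date</i>, <i>params_b</i>, and <i>score</i> columns) counts, for each day, the large-model submissions that fall below the best small-model score. It is a sketch of the style of analysis, not the exact pipeline used for the figure. </p>
<pre>
# Illustrative sketch with a hypothetical schema: for each day, count the
# large (over 13B parameter) submissions that score below the best small
# (13B or fewer parameter) model submitted that day.
import pandas as pd

def outperformed_large_models(df: pd.DataFrame) -> pd.Series:
    """df columns: date, params_b (parameters in billions), score."""
    small = df[df.params_b.le(13)]
    large = df[df.params_b.gt(13)]
    # Best small-model score for each day.
    best_small = small.groupby("date").score.max().rename("best_small").reset_index()
    # Large models whose score falls below that day's best small model.
    merged = large.merge(best_small, on="date")
    return merged[merged.score.lt(merged.best_small)].groupby("date").size()
</pre>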
</div>
<div class="description_">
<h4> A shift in the relationship between compute and performance </h4>
<p> In complex systems, it is challenging to manipulate one variable in isolation and foresee all implications.
Throughout the 20th century doctors recommended removing tonsils in response to any swelling or infection,
but research has recently shown the removal may lead to higher incidence of throat cancer \citep{liang2023}.
Early televised drug prevention advertisements in the 2000s led to increased drug use \citep{Terry-McElrath2011}.
In a similar vein, the belief that more compute equates with more risk belies a far more complex picture that
requires re-examining the relationship between performance and compute. A key limitation of simply throwing
more scale at a task is that the relationship between additional compute and generalization remains poorly
understood. A growing body of research suggests that the relationship between compute and performance is far more
complex. Empirical evidence suggests that small models are rapidly becoming more performant and riskier. </p>
<p> <b>Data quality reduces reliance on compute.</b> Models trained on better data do not require as much compute.
A large body of work has emerged which shows that efforts to better curate the training corpus, including
de-duping \citep{taylor2022galactica, kocetkov2022stack}, data pruning \citep{marion2023more,ayadata2024,sorscher2023neural,albalak2024survey,tirumala2023d4,chimoto2024critical}
or data prioritization \citep{boubdir2023prompts,thakkar2023selfinfluence} can compensate for more weights.
This suggests that the number of learnable parameters is not definitively the constraint on improving performance;
investments in better data quality mitigate the need for more weights \citep{ayadata2024,penedo2023refinedweb,raffel2020exploring,lee2022deduplicating}.
If the size of a training dataset can be reduced without impacting performance \citep{marion2023more},
training time is reduced. This directly impacts the number of training FLOP and means less compute is needed. </p>
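<p> As a minimal sketch of why this matters for compute, the toy example below de-duplicates a corpus and compares training FLOP before and after, using the common approximation of roughly 6 FLOP per parameter per training token. The numbers and the exact-match de-duplication are illustrative assumptions; production pipelines use far more sophisticated near-duplicate detection \citep{lee2022deduplicating}. </p>
<pre>
# Minimal sketch: exact-match de-duplication of a toy corpus, plus the training
# FLOP saved under the rough ~6 * parameters * tokens approximation. Numbers
# are hypothetical; real pipelines use near-duplicate detection.
import hashlib

def dedupe(documents):
    seen, unique = set(), []
    for doc in documents:
        h = hashlib.sha256(doc.strip().lower().encode()).hexdigest()
        if h not in seen:
            seen.add(h)
            unique.append(doc)
    return unique

def train_flop(n_params, n_tokens):
    return 6 * n_params * n_tokens   # rule-of-thumb forward + backward cost

corpus = ["A deep net.", "a deep net.", "An entirely different document."]
print(f"{len(dedupe(corpus))} of {len(corpus)} documents kept")

# Hypothetical: pruning a 2T-token corpus to 1.4T tokens for a 7B model.
saving = train_flop(7e9, 2.0e12) - train_flop(7e9, 1.4e12)
print(f"training FLOP saved: {saving:.2e}")
</pre>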
<p> <b>Optimization breakthroughs compensate for compute.</b> Progress over the last few years has been as
much due to optimization improvements as it has been due to compute. This includes extending pre-training
with instruction finetuning to teach models instruction following \citep{singh2024aya}, model distillation
using synthetic data from larger more performant "teachers" to train highly capable, smaller
"students" \citep{gemmateam2024gemma,aryabumi2024aya}, chain-of-thought reasoning \citep{wei2023chainofthought,hsieh2023distilling},
increased context-length \citep{xiong2023effective}, enabled tool-use \citep{qin2023toolllm,wang2023voyager},
retrieval augmented generation \citep{pozzobon2023goodtriever,NEURIPS2020_6b493230}, and preference training to align
models with human feedback \citep{dang2024rlhfspeaklanguagesunlocking,ahmadian2024basics,ouyang2022LLMRLHF,bai2022constitutional,lee2023rlaif,tunstall2023zephyr,khalifa2021distributional,rafailov2023DPO,azar2023IPO}.
All these techniques compensate for the need for weights or expensive prolonged training \citep{ho2024algorithmicprogresslanguagemodels}.
All things equal, these have been shown to dramatically improve model performance relative to a model trained
without these optimization tricks given the same level of compute \citep{davidson2023ai,hernandez2020,erdil2023algorithmic,METR_undated,liu2024sophia}.
In Figure \ref{fig:13b_models}, we plot the best daily 13B or smaller model submitted to the \href{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}{Open LLM Leaderboard} over time.
In a mere span of 2 years, the best-performing daily scores from small models went from an average of 38.59\% to an average of 77.15\% across 2024 submissions.
The takeaway is clear -- smaller models with the same amount of capacity are becoming more and more performant. </p>
<p> <b>Architecture plays a significant role in determining scalability.</b> The introduction of a new architecture
design can fundamentally change the relationship between compute and
performance \citep{tay2022scaling,Sevilla_2022,ho2024algorithmic} and render any compute threshold that is
set irrelevant. For example, the key breakthroughs in AI adoption around the world were the introduction of
architectures like convolutional neural networks (CNNs) for vision \citep{inproceedings2011,Krizhevsky2012,szegedy2014going} and
Transformers for language modeling \citep{vaswani2023attention}. </p>
<p> While deep neural networks represent a huge step forward in performance for a given level of compute, what is
often missed is that our architectures also represent the ceiling in what is achievable through scaling.
While progress has revolved around deep neural networks for the last decade, there is much to suggest that the
next significant gain in efficiency will require an entirely different architecture. Deep neural networks remain
very inefficient as an algorithm. Our typical training regimes require that all examples are shown the same
number of times during training \citep{xue2023adaptive}. All modern networks are trained based upon
minimization of average error \citep{Goodfellow-et-al-2016}. This means that learning rare artifacts requires
far more training time or capacity due to the diluted signal of infrequent attributes relative to the most
frequent patterns in the dataset \citep{Achille2017CriticalLP, jiang2020exploring, Mangalam2019DoDN, 2020fartash,frankle2020,pmlr-v70-arpit17a}.
Small models are already good at learning the most frequent features; most easy features and common patterns are
learned early in training, with much harder rare features learned in later stages \citep{agarwal2020estimating,paul2021deep,Mangalam2019DoDN,siddiqui2022metadata,abbe2021staircasepropertyhierarchicalstructure}.
When we radically scale the size of a model, most of the gains in performance are on rare and underrepresented
attributes in the dataset -- the long tail \citep{hooker2019compressed,hooker2020characterising}.
Put differently, scaling is being used to inefficiently learn a very small fraction of the overall training
dataset. Our reliance on global updates also results in catastrophic forgetting, where performance deteriorates
on the original task because the new information interferes with previously learned behavior \citep{Mcclelland1995,pozzobon2023goodtriever}.
All this suggests that our current architecture choices are probably not final and key disruptions lie ahead.
This is likely to radically change any scaling relationships, in the same way it has done in the last decade.
For example, it is unlikely any prediction of how compute scales based upon architectures before deep neural networks holds
true post-2012 after the introduction of convolutional neural networks.</p>
</div>
<!-- Uncomment below to create image container with key findings -->
<!-- <div class="imgs-container">
<div class="descriptions_">
<p> The primary findings of our work can be summarized as follows: </p>
<div class="img-container">1. Pruning would be better described as "selective brain damage." Pruning has a non-uniform impact across classes; a fraction of classes are disproportionately and systematically impacted by the introduction of sparsity.</div>
<div class="img-container">2. The examples most impacted by pruning, which we term <i>Pruning Identified Exemplars</i> (PIEs), are more challenging for both pruned and non-pruned models to classify.</div>
<div class="img-container">3. Pruning significantly reduces robustness to image corruptions and natural adversarial images.</div>
</div>
</div> -->
<div class="descriptions_">
<h3>Avoiding a FLOP FLOP</h3>
</div>
<div class="description_">
<p> <i>Are FLOP a reliable proxy for overall compute?</i> Even if the relationship between compute and generalization
were stable – there are difficulties operationalizing FLOP as a metric. FLOP \citep{Goldberg1991} refers
to <i>floating-point operations</i>, and has a fairly straightforward definition: sum up all the math operations in
floating point (such as addition, subtraction, multiplication, and division). In the 1950s and 1960s, as computers
were becoming more prevalent, the need for a standard measure of performance arose. FLOP are particularly useful in fields
that require floating-point calculations, such as scientific computations, advanced analytics, and 3D graphics processing.
This is because all these areas are dominated by simple primitive mathematical operations – for example, FLOP tend to be
closely associated with the size of models because deep neural network layers are dominated by a single
operation -- matrix multiplies -- which can be decomposed into a set of floating point operations \citep{fawzi2022discovering,davies2024}. </p>
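<p> As a concrete illustration of that decomposition, the standard convention is to count a matrix multiply of an (m x k) matrix by a (k x n) matrix as roughly 2 * m * k * n floating point operations (k multiplications plus about k additions per output entry). The short sketch below applies this to a single dense layer; it is a back-of-the-envelope accounting with hypothetical layer sizes, not a full model tally. </p>
<pre>
# Back-of-the-envelope FLOP accounting: an (m x k) by (k x n) matrix multiply
# costs roughly 2 * m * k * n floating point operations (k multiplies plus
# about k adds per output entry).
def matmul_flop(m, k, n):
    return 2 * m * k * n

# Forward pass of one dense layer on a batch: activations (batch x d_in)
# multiplied by weights (d_in x d_out).
def dense_layer_flop(batch, d_in, d_out):
    return matmul_flop(batch, d_in, d_out)

print(f"{dense_layer_flop(batch=32, d_in=4096, d_out=4096):.2e} FLOP")
</pre>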
<p> <b>We first begin by noting there are some reasons FLOP are attractive as a policy measure.</b> The primary one is
that FLOP provides a standardized way to compare across different hardware and software stacks. FLOP counts
don’t change across hardware – the number of mathematical operations is the same no matter what hardware you train a
model on. In a world where hardware is increasingly heterogeneous \citep{hooker2021} and it is hard to replicate the
exact training setting due to a lack of software portability \citep{NEURIPS2023_42c40aff}, it is attractive to use a
metric that doesn’t depend on replicating exact infrastructure. It also neatly sidesteps reporting issues that could
occur if relying only on the number of hardware devices used to train a model. The rapidly increasing performance of
new hardware generations \citep{epoch2023trendsinmachinelearninghardware}, as well as engineering investments in
training infrastructure \citep{yoo2022scalable,lepikhin2020gshard}, mean that over time much larger models will be
trained using the same number of devices. FLOP is also a metric which could potentially be inferred by cloud providers.
Given most machine learning workloads are run by a few key cloud providers, this may make administering such a measure
effectively easier \citep{heim2024governing}. </p>
<p> A key conundrum posed by FLOP thresholds is that policymakers are using FLOP as a proxy for risk, but
FLOP doesn’t say anything about end performance of a model --- only about the number of operations applied to the data.
For example, if you compare two models trained for the same number of FLOP but one has had safety alignment during
post-training \citep{aakanksha2024multilingualalignmentprismaligning,bai2022constitutional} and the other has
none – these two models will still be accorded the same level of risk according to the number of FLOP, but one will present
a far lower risk to society because of safety alignment. </p>
<p> Another key hurdle that governance which adopts compute thresholds will have to overcome is the lack of clear guidance
in the policy to date about how FLOP will actually be measured in practice. This ambiguity risks FLOP as a
metric being irrelevant or at the very least easy to manipulate. Developing principled standards for measuring any
metric of interest is essential for ensuring that safety measures are applied in a proportionate and appropriate way.
In the following section, we specify some of the key ways in which it is easy to manipulate FLOP if it is left
underspecified as a metric. </p>
</div>
<div class="descriptions_">
<h4> Challenges of using FLOP as a metric </h4>
<p> <b>Training FLOP doesn't account for post-training leaps in performance.</b> Applying scrutiny and regulation based
upon training FLOP ignores that a lot of compute can be spent outside of training to improve performance of a model.
This can be grouped under <q>inference-time compute</q> and can result in large performance gains that dramatically
increase the risk profile of a model. The limited work to-date which has evaluated a subset
of <q>inference-time compute</q> improvements estimates these can impart gains between 5x and 20x of base level
post-training performance \citep{davidson2023ai}. <q>Inference-time compute</q> includes best-of-n sampling
techniques \citep{geminiteam2024gemini}, chain-of-thought reasoning \citep{wei2023chainofthought,hsieh2023distilling,wang2023selfconsistency}
and model distillation using synthetic data \citep{aryabumi2024aya,shimabucoro2024llmseellmdo,ustun2024aya, geminiteam2024gemini}.
All these techniques require more compute at test-time because of the need to perform more forward passes of the
model to generate additional samples. However, these are not reflected in training time costs and indeed
can often <i>reduce</i> the compute needed during training. For example, smaller, more performant models are often
trained on smaller amounts of synthetic data from a highly performant teacher \citep{epoch2023tradingoffcomputeintrainingandinference,huang2022large}.
These improvements dramatically improve performance but are currently completely ignored by compute thresholds
since they don't contribute to <i>training</i> FLOP. </p>
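<p> A toy calculation makes the gap concrete. Using the common rules of thumb of roughly 6 FLOP per parameter per training token and 2 FLOP per parameter per generated token at inference, the sketch below (with entirely hypothetical numbers) shows how best-of-n sampling over a large query volume can rack up more compute than pre-training itself, none of which counts towards a training FLOP threshold. </p>
<pre>
# Toy arithmetic with hypothetical numbers: training FLOP versus the cumulative
# inference FLOP added by best-of-n sampling after deployment. Both formulas
# are standard rules of thumb, not exact accounting.
def train_flop(n_params, n_tokens):
    return 6 * n_params * n_tokens               # ~6 FLOP per parameter per token

def inference_flop(n_params, tokens_per_query, n_samples, n_queries):
    # ~2 FLOP per parameter per token, times n forward passes per query.
    return 2 * n_params * tokens_per_query * n_samples * n_queries

training = train_flop(n_params=8e9, n_tokens=2e12)
serving = inference_flop(8e9, tokens_per_query=1000, n_samples=16, n_queries=1e9)
print(f"training: {training:.1e} FLOP, best-of-16 serving: {serving:.1e} FLOP")
</pre>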
<p> Increasing the context-length \citep{xiong2023effective} and retrieval augmented
systems \citep{lee2024longcontext,pozzobon2023goodtriever,NEURIPS2020_6b493230} are additional examples of
introducing additional computational overhead at test-time by increasing the number of tokens to process.
Retrieval augmented generation (RAG) has become a mainstay of state-of-the-art models yet is often introduced after training.
Most RAG systems are critical for keeping models up-to-date with knowledge yet contribute minimal or no training FLOP.
Retrieval augmentation is particularly good at supplementing models with search capabilities or external
knowledge, which can enhance risks that depend on up-to-date knowledge, such as biorisk and cybersecurity threats. </p>
<p> Additionally, increasing the context length often requires minimal additional training FLOP but can dramatically increase the performance
of a model. Entire books can be passed in at test time, dramatically improving model performance on specialized
tasks (Gemini has a 2M token context window) \citep{xiong2023effective}. This can make the number of training FLOP irrelevant if
sensitive biological data can be passed at inference time in a long context window. </p>
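<p> A rough sketch of the arithmetic, shown below, illustrates the point for long contexts: the attention score computation grows quadratically with sequence length, so a 2M-token context incurs orders of magnitude more inference-time FLOP than a short prompt while leaving training FLOP untouched. The formula is deliberately simplified (it ignores feed-forward blocks, KV caching, and other implementation details) and the model dimensions are hypothetical. </p>
<pre>
# Simplified sketch: attention-score FLOP grow quadratically with sequence
# length. Model dimensions are hypothetical; feed-forward blocks, KV caching
# and other details are ignored.
def attention_flop(seq_len, d_model, n_layers):
    qk = 2 * seq_len * seq_len * d_model    # query-key score matrix
    av = 2 * seq_len * seq_len * d_model    # scores multiplied by values
    return (qk + av) * n_layers

short_ctx = attention_flop(seq_len=2_000, d_model=4096, n_layers=32)
long_ctx = attention_flop(seq_len=2_000_000, d_model=4096, n_layers=32)
print(f"a 2M-token context costs {long_ctx / short_ctx:,.0f}x the attention FLOP of a 2K prompt")
</pre>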
<p> <b>Difficulty Tracking FLOP across model lifecycle.</b> Increasingly, training a model falls into distinct stages
that all confer different properties. For example, unsupervised pre-training dominates compute costs because the
volume of data is typically in the trillions of tokens \citep{epoch2023trendsinthedollartrainingcostofmachinelearningsystems,heim2023palm}.
Following this, there is instruction finetuning, which gives the model the ability to follow
instructions \citep{ayadata2024} and then preference training \citep{aakanksha2024multilingualalignmentprismaligning,ahmadian2024basics,bai2022constitutional,ouyang2022LLMRLHF,lee2023rlaif,tunstall2023zephyr,khalifa2021distributional,rafailov2023DPO,azar2023IPO},
which aligns model performance with human values. Between each of these steps models are often released
publicly \citep{ustun2024aya,touvron2023llama,aryabumi2024aya}, meaning that developers can take a model from a
different developer and continue optimizing. The models with the most downloads on platforms like HuggingFace are
base models which are most conducive for continued pre-training. As sharing of models at different stages of the
life-cycle becomes more common, so will difficulties in tallying FLOP across the entire model life-cycle.
Furthermore, it may simply be infeasible to trace federated, decentralized training of models where hardware often
belongs to many different participants and training is conducted in a privacy-preserving manner \citep{donyehiya2023cold,borzunov2023petals,yuan2023decentralizedtrainingfoundationmodels,qin2024federatedfullparametertuningbillionsized}. </p>
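<p> Even keeping a ledger of FLOP across these stages is non-trivial once a model changes hands. The sketch below shows what such a life-cycle tally might look like in code; the stage names, developers, and FLOP figures are purely illustrative, and no such shared accounting standard currently exists. </p>
<pre>
# Hypothetical sketch of a life-cycle FLOP ledger spanning multiple developers.
# Stage names, developers, and numbers are illustrative only.
from dataclasses import dataclass, field

@dataclass
class TrainingStage:
    name: str
    developer: str
    flop: float

@dataclass
class ModelProvenance:
    stages: list = field(default_factory=list)

    def add(self, name, developer, flop):
        self.stages.append(TrainingStage(name, developer, flop))

    def total_flop(self):
        return sum(stage.flop for stage in self.stages)

ledger = ModelProvenance()
ledger.add("unsupervised pre-training", "developer A", 8.0e24)
ledger.add("continued pre-training", "developer B", 5.0e23)
ledger.add("instruction finetuning", "developer B", 3.0e21)
ledger.add("preference training", "developer C", 1.0e21)
print(f"life-cycle total: {ledger.total_flop():.2e} FLOP")
</pre>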
<p> <b>How to handle Mixture of Experts (MoEs) and classic ensembling?</b>
MoEs \citep{zadouri2023pushing,shazeer2018meshtensorflow,riquelme2021scaling,du2022glam,fedus2022switch,tan2024scattered}
are examples of adaptive compute -- where examples are routed to different parts of a model. This type of architecture
can often provide powerful efficiency gains, as despite a much larger overall architecture, only a subset of weights
are activated for a given example. Current policy frameworks do not clearly specify how to handle Mixture of
Experts (MoEs), which constitute some of the most highly performant systems currently deployed, such as
Mixtral \citep{jiang2024mixtral} and the Gemini family of models \citep{geminiteam2024gemini}. However, this raises
important questions – should the compute for each expert be counted towards total FLOP, or only the FLOP used to train
the subset of experts that are active at inference time? Given final performance depends on all experts in an
MoE, a recommendation should be to include all FLOP in the final consideration, but this is currently under-specified.
It also raises the question of how to treat new \emph{hybrid techniques} which train several specialized experts and then
both average parameters and utilize routing \citep{sukhbaatar2024branchtrainmix}. </p>
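<p> A small arithmetic sketch, with illustrative parameter counts loosely inspired by sparse MoE designs, makes the ambiguity concrete: per-token FLOP depend only on the experts that are active, while every expert still had to be trained. Which of the two numbers a threshold should use is exactly what current guidance leaves unspecified. </p>
<pre>
# Sketch with illustrative numbers: per-token FLOP for a sparse MoE depend on
# the active experts, but all experts contributed to training. A threshold
# could plausibly count either quantity.
def moe_flop_per_token(shared_params, expert_params, n_experts, active_experts):
    active = shared_params + active_experts * expert_params
    total = shared_params + n_experts * expert_params
    return 2 * active, 2 * total            # ~2 FLOP per parameter per token

active_flop, all_expert_flop = moe_flop_per_token(
    shared_params=12e9, expert_params=5e9, n_experts=8, active_experts=2)
print(f"active-expert accounting: {active_flop:.1e} FLOP per token")
print(f"all-expert accounting: {all_expert_flop:.1e} FLOP per token")
</pre>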
<p>Classical <i>simple ensembling techniques</i> dominate production systems in the real
world \citep{ko2023fairensemble,li2024agents} and have been shown to heavily outperform a single model.
Unlike MoEs which are jointly optimized or trained using a router, classic ensembles are often only combined at
inference time using simple averaging of weights. Given the ensemble is never trained together, it is unclear whether
FLOP should reflect the compute of the single final model or the sum of all the training compute across models that
were averaged. If it only reflects the FLOP of the final model, this may underestimate risk given ensembling is known
to improve performance. </p>
<p> <b>FLOP only accounts for a single model, but does not capture risk of the overall system.</b>
The emphasis on compute thresholds as an indicator of risk also implies that risk is the property of a single model
rather than the system in which it is deployed. In the real-world, impact and risk are rarely attributable to a
single model but are a facet of the entire system a model sits in and the way it interacts with its
environment \citep{compound-ai-blog,NIPS2015_86df7dcf,jatho2023concretesafetymlproblems,raji2020closingaiaccountabilitygap}.
Many real-world production systems are made up of cascading models where the final output is produced as a result of
inputs being processed by multiple algorithms in sequence \citep{paleyes2022,FrontierModelForum,NIPS2015_86df7dcf,shankar2022operationalizing}.
There has yet to be guidance on whether the FLOP threshold is specific to a single model or whether all models that
constitute an end-to-end system contribute to the final tally. This has significant implications for model
providers – a cascade system is often made up of models which are not individually very powerful or risky – yet the
overall system may exceed the FLOP threshold. </p>
<p> There is also no specification as to how to treat model agents which may interact with both each other and/or use tools.
End performance of the agents is undoubtedly due to the interactions with other agents and access to
tools \citep{li2024agents}, yet is unlikely to be considered a single model. It has already been shown that models
which are enabled with tool use, or can interact with a wider environment outperform a single model on its
own \citep{wang2023voyageropenendedembodiedagent,anwar2024foundationalchallengesassuringalignment,mialon2023augmentedlanguagemodelssurvey}.
These are far from edge cases; the reality is that most technology deployed in the wild is rarely just an algorithm in
isolation. Typically, interdependent models feed into a user experience and interact with a set of choices about design and
delivery that impact the overall level of risk. </p>
<p> <b>FLOP varies dramatically for different modalities.</b> In Figure \ref{fig:different_modalities}, we plot the
FLOP requirements over time of models grouped according to modality and downstream use
case (model FLOP data from \citet{epoch2023pcdtrends}). It is easy to observe that the compute requirements have not
increased at the same rate across modalities. For example, code models typically require less
compute \citep{lin2024scaling}, as do biological models \citep{epoch2024biologicalsequencemodelsinthecontextoftheaidirectives}.
Multilingual models \citep{ustun2024aya,aryabumi2024aya} tend to require more compute for each additional
language covered. This is often referred to as the \textit{curse of multilinguality} \citep{ustun2024aya,arivazhagan2019massively,conneau2019unsupervised,pfeiffer2022lifting},
where capacity is split between more languages such that performance on any given language suffers relative to a
monolingual (single language) model of the same size. These differing compute needs mean that a single threshold may
penalize some types of models and reward others. For example, thresholds may penalize multilingual models that attempt
to serve many languages and improve access to technology \citep{ustun2024aya,aryabumi2024aya}.</p>
<p> One way to address differences in modalities is to maintain different compute thresholds for each modality.
While at first glance this is an attractive solution, it also imposes more technical overhead on governments who
must correctly set a hard-coded benchmark for each modality. For example, it is interesting to note that the
US Executive Order already has at least one modality-specific caveat to the compute thresholds by carving out a
separate compute threshold for biological models, set lower at $10^{23}$ operations for models trained on biological sequence data.
However, since the threshold was set, models like xTrimoPGLM \citep{chen2024xtrimopglm} already exceed
this biological threshold by a factor of 6x \citep{epoch2024biologicalsequencemodelsinthecontextoftheaidirectives}.
Many models \citep{lin2023,elnaggar2020,Dalla-Torre2023.01.11.523679} are currently within a factor of 10x the
Executive Order’s reporting threshold \citep{epoch2024biologicalsequencemodelsinthecontextoftheaidirectives}.
These models do not appear to present a decidedly different risk profile from previous generations, so if the goal
of the threshold is to mark an inflection point for amplified risk, it is unclear whether it has been set successfully. </p>
<p> Specifying separate thresholds for different modalities also risks inviting gamification. For example, to
avoid the lower threshold of scrutiny for biological models, one loophole is to keep biology-specific training
data below 50\%. According to current guidance, the model would no longer qualify as a ``biological'' model and
would only be subject to the higher general purpose compute thresholds. Galactica-120B \citep{taylor2022galactica} and
Llama-molinst-protein-7b \citep{fang2024domainagnostic} are both examples of models with capabilities for biological
sequence modeling without primarily being trained on biological sequence data. Despite both presenting biological
capabilities, neither is likely to be considered ``biological'' under the current Executive Order requirements \citep{epoch2024biologicalsequencemodelsinthecontextoftheaidirectives}.
This highlights the fundamental tension of relying on compute alone -- since it is not anchored to the risk metric that is
of primary concern, it may be possible to sidestep in many creative ways while still presenting high-risk capabilities.</p>
<p> In Appendix \ref{sect:technical_details_FLOP}, we also present some more technical aspects of the difficulty of
measuring FLOP in practice, such as the difference between theoretical and hardware FLOP, and how to handle difference
in quantization. Developing principled standards for measuring FLOP is essential for ensuring that safety measures are
applied in a proportionate and appropriate way. </p>
</div>
<div class="descriptions_">
<h3>We are not very good at predicting the relationship between compute and risk.</h3>
</div>
<div class="description_">
<p> The choice of where compute thresholds are set will have far-ranging implications – set too low, and too many models will be selected for additional auditing and benchmarking each year. In contrast, if it is set too high, not enough models will be audited for risk, and the threshold risks becoming decorative rather than a meaningful indicator of risk. None of the policies to date have provided justification about where they have set their thresholds, or why they exclude almost all models deployed in the wild today. In Section \ref{sect:tradeoff_compute_performance}, we grappled with the changing overall relationship between compute and performance. However, scientific justification for a threshold requires predicting how downstream risk scales with additional compute. Indeed, ideally the choice of a hard-coded threshold reflects scientific consensus as to when particular risk factors are expected to emerge due to scale. Hence, it is worth considering our success to date in estimating how different model properties change with scale. </p>
<p> Warren Buffet once said <i><q>Don’t ask the barber if you need a haircut.</q></i> In the same vein, don’t ask a computer scientist or economist whether you can predict the future. The temptation to say yes often overrides a necessary humility about what can and cannot be predicted accurately. One such area where hubris has overridden common sense is attempts to predict the relationship between scale and performance in the form of \textit{scaling laws} \citep{kaplan2020scaling,hernandez2021scaling,Dhariwal2021DataAP} which either try and predict how a model's pre-training loss scales \citep{bowman2023things} or how downstream properties emerge with scale. It is the latter task which is urgently needed by policymakers in order to anticipate the emergence of unsafe capabilities and inform restrictions (such as compute thresholds) at inflection points where risk increases with scale \citep{anthropic_responsible_scaling,openai_global_affairs, kaminski_regulating_2023}. </p>
<p> One of the biggest limitations of scaling laws is that they have only been shown to hold when predicting a model’s pre-training test loss \citep{bowman2023things}, which measures the model’s ability to correctly predict how an incomplete piece of text will be continued. When actual performance on downstream tasks is used, the results are often murky or inconsistent \citep{Ganguli_2022,schaeffer2023emergent,anwar2024foundational,schaeffer2024predictingdownstreamcapabilitiesfrontier,hu2024predictingemergentabilitiesinfinite}. Indeed, the term \textit{emergent properties} is often used to describe this discrepancy \citep{Wei2022,srivastava2023imitation}: a property that appears “suddenly” as the complexity of the system increases and cannot be predicted. Emergent properties imply that scaling laws don't hold when you try to predict downstream performance instead of predicting test loss for the next word token. </p>
<p> Even when limited to predicting test loss, there have been issues with replicability of scaling results under slightly different assumptions about the distribution \citep{besiroglu2024chinchilla,anwar2024foundationalchallengesassuringalignment}. Research has also increasingly found that many downstream capabilities display irregular scaling curves \citep{srivastava2023imitation} or non power-law scaling \citep{caballero2023broken}. For complex systems that require projecting into the future, small errors end up accumulating due to the time step dependencies being modelled. This makes accurate predictions of when risks will emerge inherently hard, which is compounded by the small sample sizes often available for analysis: each data point is a model, and computation cost means scaling ``laws'' are frequently based upon analysis of fewer than 100 data points \citep{ruan2024observationalscalinglawspredictability}. This means many reported power law relationships can lack statistical support and power \citep{powerlawtruths}.</p>
<p> One immediate recommendation is that the accuracy of scaling laws and predictions of emerging risk can be greatly improved by more guidance from policymakers about what range is of interest and specifying the risks that policymakers are concerned about \citep{powerlawtruths}. For example, there is a big difference between using scaling laws to optimize for the correct amount of training data in your next large-scale run versus attempting to extrapolate trends several orders of magnitude out. Typically, policy use cases demand high precision over a longer time horizon, which is exactly the type of extrapolation we are currently worst at. Specifying which risks are of interest will also benefit precision; scaling laws tend to have high variance in precision between tasks. For example, code generation has shown fairly predictable power law scaling across 10 orders of magnitude of compute \citep{hu2024predictingemergentabilitiesinfinite,anwar2024foundational}. However, other capabilities have been shown to scale far more erratically \citep{srivastava2023imitation,caballero2023broken}. Perhaps as important, policymakers should be aware that accurately predicting the impact of scaling is currently far from feasible. Hence, there is currently limited scientific support for using exact thresholds of compute alone to triage different risk levels.</p>
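<p> To see how fragile such extrapolations can be, the sketch below fits a power law of the form loss = a * C^(-b) to a handful of synthetic (compute, loss) points in log-log space and then extrapolates four orders of magnitude beyond the observed range. The data are invented purely for illustration; the point is that small changes to the fitted exponent translate into large changes in the extrapolated prediction. </p>
<pre>
# Minimal sketch with synthetic data: fit loss = a * C**(-b) in log-log space
# from a handful of runs, then extrapolate four orders of magnitude. Small
# perturbations of the fitted exponent move the prediction substantially.
import numpy as np

compute = np.array([1e20, 3e20, 1e21, 3e21, 1e22])    # training FLOP (synthetic)
loss = np.array([2.31, 2.10, 1.95, 1.83, 1.72])       # pre-training loss (synthetic)

slope, intercept = np.polyfit(np.log(compute), np.log(loss), 1)

def predict(c, b=slope):
    return np.exp(intercept) * c ** b

target = 1e26                                           # far outside the observed range
print(f"fitted exponent: {slope:.3f}")
print(f"extrapolated loss at 1e26 FLOP: {predict(target):.2f}")
print(f"same fit with the exponent perturbed by 10%: {predict(target, b=slope * 1.1):.2f}")
</pre>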
</div>
<script>
filterSelection("atypical") // Execute the function and show all columns
function filterSelection(c) {
var x, y, i;
x = document.getElementsByClassName("column_portfolio_");
y = document.getElementsByClassName("column_header_");
// Add the "show" class (display:block) to the filtered elements,
// and remove the "show" class from the elements that are not selected
for (i = 0; i < x.length; i++) {
RemoveClass(x[i], "show");
if (x[i].className.indexOf(c) > -1) AddClass(x[i], "show");
}
for (i = 0; i < y.length; i++) {
RemoveClass(y[i], "show");
if (y[i].className.indexOf(c) > -1) AddClass(y[i], "show");
}
}
filterSelection_("pie") // Execute the function and show all columns
function filterSelection_(c) {
var x, y, z, i;
x = document.getElementsByClassName("column_portfolio");
y = document.getElementsByClassName("column_header");
z = document.getElementsByClassName("column_two_fig");
// Add the "show" class (display:block) to the filtered elements,
// and remove the "show" class from the elements that are not selected
for (i = 0; i < x.length; i++) {
RemoveClass(x[i], "show");
if (x[i].className.indexOf(c) > -1) AddClass(x[i], "show");
}
for (i = 0; i < y.length; i++) {
RemoveClass(y[i], "show");
if (y[i].className.indexOf(c) > -1) AddClass(y[i], "show");
}
for (i = 0; i < z.length; i++) {
RemoveClass(z[i], "show");
if (z[i].className.indexOf(c) > -1) AddClass(z[i], "show");
}
}
filterSelectionfinal("thirty") // Execute the function and show all columns
function filterSelectionfinal(c) {
var x, y, i;
x = document.getElementsByClassName("column_portfoliofinal");
y = document.getElementsByClassName("column_headerfinal");
// Add the "show" class (display:block) to the filtered elements,
// and remove the "show" class from the elements that are not selected
for (i = 0; i < x.length; i++) {
RemoveClass(x[i], "show");
if (x[i].className.indexOf(c) > -1) AddClass(x[i], "show");
}
for (i = 0; i < y.length; i++) {
RemoveClass(y[i], "show");
if (y[i].className.indexOf(c) > -1) AddClass(y[i], "show");
}
}
filterSelectionfinalfinal("frequently") // Execute the function and show all columns
function filterSelectionfinalfinal(c) {
var x, y, i;
x = document.getElementsByClassName("column_portfoliofinalfinal");
y = document.getElementsByClassName("column_headerfinalfinal");
// Add the "show" class (display:block) to the filtered elements,
// and remove the "show" class from the elements that are not selected
for (i = 0; i < x.length; i++) {
RemoveClass(x[i], "show");
if (x[i].className.indexOf(c) > -1) AddClass(x[i], "show");
}
for (i = 0; i < y.length; i++) {
RemoveClass(y[i], "show");
if (y[i].className.indexOf(c) > -1) AddClass(y[i], "show");
}
}
// Show filtered elements
function AddClass(element, name) {
var i, arr1, arr2;
arr1 = element.className.split(" ");
arr2 = name.split(" ");
for (i = 0; i < arr2.length; i++) {
if (arr1.indexOf(arr2[i]) == -1) {
element.className += " " + arr2[i];
}
}
}
// Hide elements that are not selected
function RemoveClass(element, name) {
var i, arr1, arr2;
arr1 = element.className.split(" ");
arr2 = name.split(" ");
for (i = 0; i < arr2.length; i++) {
while (arr1.indexOf(arr2[i]) > -1) {
arr1.splice(arr1.indexOf(arr2[i]), 1);
}
}
element.className = arr1.join(" ");
}
// Add active class to the current button (highlight it)
var btnContainer1 = document.getElementById("myBtnContainer");
var btns1 = btnContainer1.getElementsByClassName("btn");
for (var i = 0; i < btns1.length; i++) {
btns1[i].addEventListener("click", function(){
var current1 = document.getElementsByClassName("active_1");
current1[0].className = current1[0].className.replace(" active_1", "");
this.className += " active_1";
});
}
// Add active class to the current button (highlight it)
var btnContainer2 = document.getElementById("myBtnContainer_2");
var btns2 = btnContainer2.getElementsByClassName("btn");
for (var i = 0; i < btns2.length; i++) {
btns2[i].addEventListener("click", function(){
var current2 = document.getElementsByClassName("active_2");
current2[0].className = current2[0].className.replace(" active_2", "");
this.className += " active_2";
});
}
// Add active class to the current button (highlight it)
var btnContainer3 = document.getElementById("myBtnContainer_3");
var btns3 = btnContainer3.getElementsByClassName("btn");
for (var i = 0; i < btns3.length; i++) {
btns3[i].addEventListener("click", function(){
var current3 = document.getElementsByClassName("active_3");
current3[0].className = current3[0].className.replace(" active_3", "");
this.className += " active_3";
});
}
// Add active class to the current button (highlight it)
var btnContainer4 = document.getElementById("myBtnContainer_4");
var btns4 = btnContainer4.getElementsByClassName("btn");
for (var i = 0; i < btns4.length; i++) {
btns4[i].addEventListener("click", function(){
var current4 = document.getElementsByClassName("active_4");
current4[0].className = current4[0].className.replace(" active_4", "");
this.className += " active_4";
});
}
</script>
</div>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.5.1/katex.min.css">
<script src="template.v1.js"></script>
<dt-appendix>
<div class="description_">
<h3>Acknowledgments</h3>
<p> A special thank you is due to ...
This article was in part prepared using the <a href="https://pair-code.github.io/saliency/">Google AI Pair</a> template and style guide.
The citation management for this article uses the <a href="https://github.com/distillpub/template">template v1</a> of the Distill style script. </p>
<p>We thank the ... </p>
<p> We thank the .... </p>
<h3>Citation</h3>
<pre class="citation long">@article{hooker2024compute,
title={On the Limitations of Compute Thresholds as a Governance Strategy},
author={Sara Hooker},
year={2024},
<!-- url={https://arxiv.org/abs/1911.05248}, -->
<!-- eprint={1911.05248}, -->
archivePrefix={arXiv},
primaryClass={cs.LG}
}
</pre>
</div>
</dt-appendix>
<div class="description_">
<h3>Bibliography</h3>
</div>
<script type="text/bibliography">
</script>
<script language="javascript" type="text/javascript" src="lib/jquery-1.12.4.min.js"></script>