-
Notifications
You must be signed in to change notification settings - Fork 0
/
temp.bib
1205 lines (1102 loc) · 67 KB
/
temp.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@article{yin2021tt,
title={{Tt-rec: Tensor train compression for deep learning recommendation models}},
author={Yin, Chunxing and Acun, Bilge and Wu, Carole-Jean and Liu, Xing},
journal={Proceedings of Machine Learning and Systems},
volume={3},
pages={448--462},
year={2021}
}
@misc{KelSolaar2023,
author = {{KelSolaar}},
title = {fvvt-kels-utilities},
year = {2023},
note = {GitHub repository},
howpublished = {\url{https://github.com/KelSolaar/fvvt-kels-utilities}}
}
@inproceedings{mentzer2019practical,
title = {Practical Full Resolution Learned Lossless Image Compression},
author = {Mentzer, Fabian and Agustsson, Eirikur and Tschannen, Michael and Timofte, Radu and Van Gool, Luc},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2019}
}
@misc{brown2020language,
title={{Language Models are Few-Shot Learners}},
author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
year={2020},
eprint={2005.14165},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@online{MosaicML2023Introducing,
author = {{MosaicML NLP Team}},
title = {Introducing MPT-7B: A New Standard for Open-Source,
Commercially Usable LLMs},
year = {2023},
url = {https://www.mosaicml.com/blog/mpt-7b},
note = {Accessed: 2023-05-05},
urldate = {2023-05-05}
}
@misc{wikitext,
title={{Pointer Sentinel Mixture Models}},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{zheng2023judging,
title={{Judging LLM-as-a-judge with MT-Bench and Chatbot Arena}},
author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
year={2023},
eprint={2306.05685},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{atlas,
title={{Few-shot learning with retrieval augmented language models}},
author={Izacard, Gautier and Lewis, Patrick and Lomeli, Maria and Hosseini, Lucas and Petroni, Fabio and Schick, Timo and Dwivedi-Yu, Jane and Joulin, Armand and Riedel, Sebastian and Grave, Edouard},
journal={arXiv preprint arXiv:2208.03299},
year={2022}
}
@misc{ram2023incontext,
title={{In-Context Retrieval-Augmented Language Models}},
author={Ori Ram and Yoav Levine and Itay Dalmedigos and Dor Muhlgay and Amnon Shashua and Kevin Leyton-Brown and Yoav Shoham},
year={2023},
eprint={2302.00083},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{liu2023scissorhands,
title={{Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time}},
author={Liu, Zichang and Desai, Aditya and Liao, Fangshuo and Wang, Weitao and Xie, Victor and Xu, Zhaozhuo and Kyrillidis, Anastasios and Shrivastava, Anshumali},
journal={arXiv preprint arXiv:2305.17118},
year={2023}
}
@misc{fid,
title={{Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering}},
author={Gautier Izacard and Edouard Grave},
year={2021},
eprint={2007.01282},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{izacard2020memory,
title={{A Memory Efficient Baseline for Open Domain Question Answering}},
author={Gautier Izacard and Fabio Petroni and Lucas Hosseini and Nicola De Cao and Sebastian Riedel and Edouard Grave},
year={2020},
eprint={2012.15156},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{ainslie2020etc,
title={{ETC: Encoding Long and Structured Inputs in Transformers}},
author={Joshua Ainslie and Santiago Ontanon and Chris Alberti and Vaclav Cvicek and Zachary Fisher and Philip Pham and Anirudh Ravula and Sumit Sanghai and Qifan Wang and Li Yang},
year={2020},
eprint={2004.08483},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{zaheer2021big,
title={{Big Bird: Transformers for Longer Sequences}},
author={Manzil Zaheer and Guru Guruganesh and Avinava Dubey and Joshua Ainslie and Chris Alberti and Santiago Ontanon and Philip Pham and Anirudh Ravula and Qifan Wang and Li Yang and Amr Ahmed},
year={2021},
eprint={2007.14062},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@article{lewis2020retrieval,
title={{Retrieval-augmented generation for knowledge-intensive nlp tasks}},
author={Lewis, Patrick and Perez, Ethan and Piktus, Aleksandra and Petroni, Fabio and Karpukhin, Vladimir and Goyal, Naman and K{\"u}ttler, Heinrich and Lewis, Mike and Yih, Wen-tau and Rockt{\"a}schel, Tim and others},
journal={Advances in Neural Information Processing Systems},
volume={33},
pages={9459--9474},
year={2020}
}
@article{rubin2023long,
title={{Long-range Language Modeling with Self-retrieval}},
author={Rubin, Ohad and Berant, Jonathan},
journal={arXiv preprint arXiv:2306.13421},
year={2023}
}
@misc{ding2023longnet,
title={{LongNet: Scaling Transformers to 1,000,000,000 Tokens}},
author={Jiayu Ding and Shuming Ma and Li Dong and Xingxing Zhang and Shaohan Huang and Wenhui Wang and Nanning Zheng and Furu Wei},
year={2023},
eprint={2307.02486},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@InProceedings{pmlr-v162-hawthorne22a,
title = {General-purpose, long-context autoregressive modeling with Perceiver {AR}},
author = {Hawthorne, Curtis and Jaegle, Andrew and Cangea, C{\u{a}}t{\u{a}}lina and Borgeaud, Sebastian and Nash, Charlie and Malinowski, Mateusz and Dieleman, Sander and Vinyals, Oriol and Botvinick, Matthew and Simon, Ian and Sheahan, Hannah and Zeghidour, Neil and Alayrac, Jean-Baptiste and Carreira, Joao and Engel, Jesse},
booktitle = {Proceedings of the 39th International Conference on Machine Learning},
pages = {8535--8558},
year = {2022},
editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
volume = {162},
series = {Proceedings of Machine Learning Research},
month = {17--23 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v162/hawthorne22a/hawthorne22a.pdf},
url = {https://proceedings.mlr.press/v162/hawthorne22a.html},
abstract = {Real-world data is high-dimensional: a book, image, or musical performance can easily contain hundreds of thousands of elements even after compression. However, the most commonly used autoregressive models, Transformers, are prohibitively expensive to scale to the number of inputs and layers needed to capture this long-range structure. We develop Perceiver AR, an autoregressive, modality-agnostic architecture which uses cross-attention to map long-range inputs to a small number of latents while also maintaining end-to-end causal masking. Perceiver AR can directly attend to over a hundred thousand tokens, enabling practical long-context density estimation without the need for hand-crafted sparsity patterns or memory mechanisms. When trained on images or music, Perceiver AR generates outputs with clear long-term coherence and structure. Our architecture also obtains state-of-the-art likelihood on long-sequence benchmarks, including 64x64 ImageNet images and PG-19 books.}
}
@misc{retro,
title={{Improving language models by retrieving from trillions of tokens}},
author={Sebastian Borgeaud and Arthur Mensch and Jordan Hoffmann and Trevor Cai and Eliza Rutherford and Katie Millican and George van den Driessche and Jean-Baptiste Lespiau and Bogdan Damoc and Aidan Clark and Diego de Las Casas and Aurelia Guy and Jacob Menick and Roman Ring and Tom Hennigan and Saffron Huang and Loren Maggiore and Chris Jones and Albin Cassirer and Andy Brock and Michela Paganini and Geoffrey Irving and Oriol Vinyals and Simon Osindero and Karen Simonyan and Jack W. Rae and Erich Elsen and Laurent Sifre},
year={2022},
eprint={2112.04426},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{MLSYS2022_773862fc,
author = {Agarwal, Saurabh and Wang, Hongyi and Venkataraman, Shivaram and Papailiopoulos, Dimitris},
booktitle = {Proceedings of Machine Learning and Systems},
editor = {D. Marculescu and Y. Chi and C. Wu},
pages = {652--672},
title = {On the Utility of Gradient Compression in Distributed Training Systems},
url = {https://proceedings.mlsys.org/paper_files/paper/2022/file/773862fcc2e29f650d68960ba5bd1101-Paper.pdf},
volume = {4},
year = {2022}
}
@misc{bernstein2018signsgd,
title={{signSGD: Compressed Optimisation for Non-Convex Problems}},
author={Jeremy Bernstein and Yu-Xiang Wang and Kamyar Azizzadenesheli and Anima Anandkumar},
year={2018},
eprint={1802.04434},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@article{shi2023replug,
title={{Replug: Retrieval-augmented black-box language models}},
author={Shi, Weijia and Min, Sewon and Yasunaga, Michihiro and Seo, Minjoon and James, Rich and Lewis, Mike and Zettlemoyer, Luke and Yih, Wen-tau},
journal={arXiv preprint arXiv:2301.12652},
year={2023}
}
@misc{usagereport,
author = {Fishkin, Rand},
title = {We Analyzed Millions of ChatGPT User Sessions: Visits are Down 29\% Since May; Programming Assistance is 30\% of Use},
year = {2023},
note = {SparkToro Blog},
url = {https://sparktoro.com/blog/we-analyzed-millions-of-chatgpt-user-sessions-visits-are-down-29-since-may-programming-assistance-is-30-of-use/}
}
@article{ppl2,
author = "Stanley F Chen and Douglas Beeferman and Roni Rosenfeld",
title = "{Evaluation Metrics For Language Models}",
year = "2008",
month = "1",
url = "https://kilthub.cmu.edu/articles/journal_contribution/Evaluation_Metrics_For_Language_Models/6605324",
doi = "10.1184/R1/6605324.v1"
}
@inproceedings{ppl1,
author = {Azzopardi, Leif and Girolami, Mark and van Rijsbergen, Keith},
title = {Investigating the Relationship between Language Model Perplexity and IR Precision-Recall Measures},
year = {2003},
isbn = {1581136463},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/860435.860505},
doi = {10.1145/860435.860505},
abstract = {An empirical study has been conducted investigating the relationship between the performance of an aspect based language model in terms of perplexity and the corresponding information retrieval performance obtained. It is observed, on the corpora considered, that the perplexity of the language model has a systematic relationship with the achievable precision recall performance though it is not statistically significant.},
booktitle = {Proceedings of the 26th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval},
pages = {369--370},
numpages = {2},
keywords = {language model},
location = {Toronto, Canada},
series = {SIGIR '03}
}
@misc{beltagy2020longformer,
title={{Longformer: The Long-Document Transformer}},
author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
year={2020},
eprint={2004.05150},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{bertsch2023unlimiformer,
title={{Unlimiformer: Long-range transformers with unlimited length input}},
author={Bertsch, Amanda and Alon, Uri and Neubig, Graham and Gormley, Matthew R},
journal={arXiv preprint arXiv:2305.01625},
year={2023}
}
@article{roy2021efficient,
title={{Efficient content-based sparse attention with routing transformers}},
author={Roy, Aurko and Saffar, Mohammad and Vaswani, Ashish and Grangier, David},
journal={Transactions of the Association for Computational Linguistics},
volume={9},
pages={53--68},
year={2021},
publisher={MIT Press}
}
@misc{
dai*2019transformerxl,
title={Transformer-{XL}: Language Modeling with Longer-Term Dependency},
author={Zihang Dai and Zhilin Yang and Yiming Yang and William W. Cohen and Jaime Carbonell and Quoc V. Le and Ruslan Salakhutdinov},
year={2019},
url={https://openreview.net/forum?id=HJePno0cYm}
}
@inproceedings{
wu2022memorizing,
title={{Memorizing Transformers}},
author={Yuhuai Wu and Markus Norman Rabe and DeLesley Hutchins and Christian Szegedy},
booktitle={{International Conference on Learning Representations}},
year={2022},
url={https://openreview.net/forum?id=TrjbxzRcnf-}
}
@article{yi2023edgemoe,
title={{EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models}},
author={Yi, Rongjie and Guo, Liwei and Wei, Shiyun and Zhou, Ao and Wang, Shangguang and Xu, Mengwei},
journal={arXiv preprint arXiv:2308.14352},
year={2023}
}
@article{miao2023specinfer,
title={{SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}},
author={Miao, Xupeng and Oliaro, Gabriele and Zhang, Zhihao and Cheng, Xinhao and Wang, Zeyu and Wong, Rae Ying Yee and Chen, Zhuoming and Arfeen, Daiyaan and Abhyankar, Reyna and Jia, Zhihao},
journal={arXiv preprint arXiv:2305.09781},
year={2023}
}
@inproceedings{aminabadi2022deepspeed,
title={{DeepSpeed-inference: enabling efficient inference of transformer models at unprecedented scale}},
author={Aminabadi, Reza Yazdani and Rajbhandari, Samyam and Awan, Ammar Ahmad and Li, Cheng and Li, Du and Zheng, Elton and Ruwase, Olatunji and Smith, Shaden and Zhang, Minjia and Rasley, Jeff and others},
booktitle={{SC22: International Conference for High Performance Computing, Networking, Storage and Analysis}},
pages={1--15},
year={2022},
organization={IEEE}
}
@inproceedings{deepspeed,
author = {Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
title = {DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters},
year = {2020},
isbn = {9781450379984},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3394486.3406703},
doi = {10.1145/3394486.3406703},
abstract = {Explore new techniques in Microsoft's open source library called DeepSpeed, which advances large model training by improving scale, speed, cost, and usability, unlocking the ability to train 100-billion-parameter models. DeepSpeed is compatible with PyTorch. One piece of our library, called ZeRO, is a new parallelized optimizer that greatly reduces the resources needed for model and data parallelism while massively increasing the number of parameters that can be trained. Researchers have used these breakthroughs to create Turing Natural Language Generation (Turing-NLG), which at the time of its release was the largest publicly known language model at 17 billion parameters. In addition we will also go over our latest transformer kernel advancements that led the DeepSpeed team to achieve the world fastest BERT pretraining record.The Zero Redundancy Optimizer (ZeRO) is a novel memory optimization technology for large-scale distributed deep learning. ZeRO can train deep learning models with over 100 billion parameters on the current generation of GPU clusters at three to five times the throughput of the current best system. It also presents a clear path to training models with trillions of parameters, demonstrating an unprecedented leap in deep learning system technology.DeepSpeed brings state-of-the-art training techniques, such as ZeRO, optimized kernels, distributed training, mixed precision, and checkpointing, through lightweight APIs compatible with PyTorch. With just a few lines of code changes to your PyTorch model, you can leverage DeepSpeed to address underlying performance challenges and boost the speed and scale of your training.},
booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
pages = {3505–3506},
numpages = {2},
keywords = {machine learning, distributed deep learning},
location = {Virtual Event, CA, USA},
series = {KDD '20}
}
@article{shoeybi2019megatron,
title={{Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}},
author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
journal={arXiv preprint arXiv:1909.08053},
year={2019}
}
@inproceedings{Leviathan2022FastIF,
title={{Fast Inference from Transformers via Speculative Decoding}},
author={Yaniv Leviathan and Matan Kalman and Yossi Matias},
booktitle={{International Conference on Machine Learning}},
year={2022},
url={https://api.semanticscholar.org/CorpusID:254096365}
}
@article{agarwal2020accordion,
title={{Accordion: Adaptive gradient communication via critical learning regime identification}},
author={Agarwal, Saurabh and Wang, Hongyi and Lee, Kangwook and Venkataraman, Shivaram and Papailiopoulos, Dimitris},
journal={arXiv preprint arXiv:2010.16248},
year={2020}
}
@misc{zhao2016tensor,
title={{Tensor Ring Decomposition}},
author={Qibin Zhao and Guoxu Zhou and Shengli Xie and Liqing Zhang and Andrzej Cichocki},
year={2016},
eprint={1606.05535},
archivePrefix={arXiv},
primaryClass={cs.NA}
}
@article{tensor_decomp,
author = {Oseledets, I. V.},
title = {Tensor-Train Decomposition},
journal = {SIAM Journal on Scientific Computing},
volume = {33},
number = {5},
pages = {2295--2317},
year = {2011},
doi = {10.1137/090752286},
url = {https://doi.org/10.1137/090752286}
}
@inproceedings{espresso,
author = {Wang, Zhuang and Lin, Haibin and Zhu, Yibo and Ng, T. S. Eugene},
title = {Hi-Speed DNN Training with Espresso: Unleashing the Full Potential of Gradient Compression with Near-Optimal Usage Strategies},
year = {2023},
isbn = {9781450394871},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3552326.3567505},
doi = {10.1145/3552326.3567505},
abstract = {Gradient compression (GC) is a promising approach to addressing the communication bottleneck in distributed deep learning (DDL). It saves the communication time, but also incurs additional computation overheads. The training throughput of compression-enabled DDL is determined by the compression strategy, including whether to compress each tensor, the type of compute resources (e.g., CPUs or GPUs) for compression, the communication schemes for compressed tensor, and so on. However, it is challenging to find the optimal compression strategy for applying GC to DDL because of the intricate interactions among tensors. To fully unleash the benefits of GC, two questions must be addressed: 1) How to express any compression strategies and the corresponding interactions among tensors of any DDL training job? 2) How to quickly select a near-optimal compression strategy?In this paper, we propose Espresso to answer these questions. It first designs a decision tree abstraction to express any compression strategies and develops empirical models to timeline tensor computation, communication, and compression to enable Espresso to derive the intricate interactions among tensors. It then designs a compression decision algorithm that analyzes tensor interactions to eliminate and prioritize strategies and optimally offloads compression from GPUs to CPUs. Experimental evaluations show that Espresso can improve the training throughput over the start-of-the-art compression-enabled system by up to 77\% for representative DDL training jobs. Moreover, the computational time needed to select the compression strategy is measured in milliseconds, and the selected strategy is only a few percent from optimal.},
booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems},
pages = {867–882},
numpages = {16},
keywords = {systems for machine learning, distributed systems, gradient compression, DNN training},
location = {Rome, Italy},
series = {EuroSys '23}
}
@inproceedings{egeria,
author = {Wang, Yiding and Sun, Decang and Chen, Kai and Lai, Fan and Chowdhury, Mosharaf},
title = {Egeria: Efficient DNN Training with Knowledge-Guided Layer Freezing},
year = {2023},
isbn = {9781450394871},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3552326.3587451},
doi = {10.1145/3552326.3587451},
abstract = {Training deep neural networks (DNNs) is time-consuming. While most existing solutions try to overlap/schedule computation and communication for efficient training, this paper goes one step further by skipping computing and communication through DNN layer freezing. Our key insight is that the training progress of internal DNN layers differs significantly, and front layers often become well-trained much earlier than deep layers. To explore this, we first introduce the notion of training plasticity to quantify the training progress of internal DNN layers. Then we design Egeria, a knowledge-guided DNN training system that employs semantic knowledge from a reference model to accurately evaluate individual layers' training plasticity and safely freeze the converged ones, saving their corresponding backward computation and communication. Our reference model is generated on the fly using quantization techniques and runs forward operations asynchronously on available CPUs to minimize the overhead. In addition, Egeria caches the intermediate outputs of the frozen layers with prefetching to further skip the forward computation. Our implementation and testbed experiments with popular vision and language models show that Egeria achieves 19\%-43\% training speedup w.r.t. the state-of-the-art without sacrificing accuracy.},
booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems},
pages = {851–866},
numpages = {16},
keywords = {layer freezing, machine learning training},
location = {Rome, Italy},
series = {EuroSys '23}
}
@article{rissanen1981efficient,
title={{Efficient arithmetic coding for data compression}},
author={Rissanen, Jorma and Langdon, Jr., Glen G.},
journal={IEEE Transactions on Communications},
volume={29},
number={6},
pages={858--865},
year={1981},
publisher={IEEE}
}
@article{rissanen1976generalized,
title={{Generalized Kraft inequality and arithmetic coding}},
author={Rissanen, Jorma},
journal={IBM Journal of Research and Development},
volume={20},
number={3},
pages={198--203},
year={1976},
publisher={IBM}
}
@article{rissanen1979arithmetic,
title={{Arithmetic coding}},
author={Rissanen, Jorma},
journal={IBM Journal of Research and Development},
volume={23},
number={2},
pages={149--162},
year={1979},
publisher={IBM}
}
@article{ac,
author = {Witten, Ian H. and Neal, Radford M. and Cleary, John G.},
title = {Arithmetic Coding for Data Compression},
year = {1987},
issue_date = {June 1987},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {30},
number = {6},
issn = {0001-0782},
url = {https://doi.org/10.1145/214762.214771},
doi = {10.1145/214762.214771},
abstract = {The state of the art in data compression is arithmetic coding, not the better-known Huffman method. Arithmetic coding gives greater compression, is faster for adaptive models, and clearly separates the model from the channel encoding.},
journal = {Commun. ACM},
month = jun,
pages = {520--540},
numpages = {21}
}
@inproceedings{smoothquant,
title={{SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models}},
author={Xiao, Guangxuan and Lin, Ji and Seznec, Mickael and Wu, Hao and Demouth, Julien and Han, Song},
booktitle={{International Conference on Machine Learning}},
pages={38087--38099},
year={2023},
organization={PMLR}
}
@article{flexgen,
title={{FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU}},
author={Sheng, Ying and Zheng, Lianmin and Yuan, Binhang and Li, Zhuohan and Ryabinin, Max and Fu, Daniel Y and Xie, Zhiqiang and Chen, Beidi and Barrett, Clark and Gonzalez, Joseph E and others},
journal={arXiv preprint arXiv:2303.06865},
year={2023}
}
@article{llmint8,
title={{LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale}},
author={Dettmers, Tim and Lewis, Mike and Belkada, Younes and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:2208.07339},
year={2022}
}
@misc{vaswani2023attention,
title={{Attention Is All You Need}},
author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
year={2023},
eprint={1706.03762},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{pope2023efficiently,
title={{Efficiently scaling transformer inference}},
author={Pope, Reiner and Douglas, Sholto and Chowdhery, Aakanksha and Devlin, Jacob and Bradbury, James and Heek, Jonathan and Xiao, Kefan and Agrawal, Shivani and Dean, Jeff},
journal={Proceedings of Machine Learning and Systems},
volume={5},
year={2023}
}
@article{ott2019fairseq,
title={{fairseq: A fast, extensible toolkit for sequence modeling}},
author={Ott, Myle and Edunov, Sergey and Baevski, Alexei and Fan, Angela and Gross, Sam and Ng, Nathan and Grangier, David and Auli, Michael},
journal={arXiv preprint arXiv:1904.01038},
year={2019}
}
@inproceedings{wolf-etal-2020-transformers,
title = "{Transformers}: State-of-the-Art Natural Language Processing",
author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = oct,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
pages = "38--45"
}
@article{dai2019transformer,
title={{Transformer-xl: Attentive language models beyond a fixed-length context}},
author={Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
journal={arXiv preprint arXiv:1901.02860},
year={2019}
}
@article{shuster2021retrieval,
title={{Retrieval augmentation reduces hallucination in conversation}},
author={Shuster, Kurt and Poff, Spencer and Chen, Moya and Kiela, Douwe and Weston, Jason},
journal={arXiv preprint arXiv:2104.07567},
year={2021}
}
@article{milbauer2023lait,
title={{LAIT: Efficient Multi-Segment Encoding in Transformers with Layer-Adjustable Interaction}},
author={Milbauer, Jeremiah and Louis, Annie and Hosseini, Mohammad Javad and Fabrikant, Alex and Metzler, Donald and Schuster, Tal},
journal={arXiv preprint arXiv:2305.19585},
year={2023}
}
@article{komeili2021internet,
title={{Internet-augmented dialogue generation}},
author={Komeili, Mojtaba and Shuster, Kurt and Weston, Jason},
journal={arXiv preprint arXiv:2107.07566},
year={2021}
}
@article{mialon2023augmented,
title={{Augmented language models: a survey}},
author={Mialon, Gr{\'e}goire and Dess{\`\i}, Roberto and Lomeli, Maria and Nalmpantis, Christoforos and Pasunuru, Ram and Raileanu, Roberta and Rozi{\`e}re, Baptiste and Schick, Timo and Dwivedi-Yu, Jane and Celikyilmaz, Asli and others},
journal={arXiv preprint arXiv:2302.07842},
year={2023}
}
@article{izacard2020leveraging,
title={{Leveraging passage retrieval with generative models for open domain question answering}},
author={Izacard, Gautier and Grave, Edouard},
journal={arXiv preprint arXiv:2007.01282},
year={2020}
}
@article{li2022decoupled,
title={{Decoupled context processing for context augmented language modeling}},
author={Li, Zonglin and Guo, Ruiqi and Kumar, Sanjiv},
journal={arXiv preprint arXiv:2210.05758},
year={2022}
}
@comment{Removed five entries that were byte-identical duplicates (same keys) of
entries already defined above: izacard2020leveraging, mialon2023augmented,
komeili2021internet, shuster2021retrieval, milbauer2023lait. Duplicate keys
trigger "repeated entry" errors in BibTeX/Biber.}
@misc{kim2023stack,
title={{Full Stack Optimization of Transformer Inference: a Survey}},
author={Sehoon Kim and Coleman Hooper and Thanakul Wattanawong and Minwoo Kang and Ruohan Yan and Hasan Genc and Grace Dinh and Qijing Huang and Kurt Keutzer and Michael W. Mahoney and Yakun Sophia Shao and Amir Gholami},
year={2023},
eprint={2302.14017},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{longchat2023,
title = {{How Long Can Open-Source LLMs Truly Promise on Context Length?}},
url = {https://lmsys.org/blog/2023-06-29-longchat},
author = {Li, Dacheng and Shao, Rulin and Xie, Anze and Sheng, Ying and Zheng, Lianmin and Gonzalez, Joseph E. and Stoica, Ion and Ma, Xuezhe and Zhang, Hao},
month = jun,
year = {2023}
}
@misc{sun2021longrange,
title={{Do Long-Range Language Models Actually Use Long-Range Context?}},
author={Simeng Sun and Kalpesh Krishna and Andrew Mattarella-Micke and Mohit Iyyer},
year={2021},
eprint={2109.09115},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{liu2023lost,
title={{Lost in the Middle: How Language Models Use Long Contexts}},
author={Nelson F. Liu and Kevin Lin and John Hewitt and Ashwin Paranjape and Michele Bevilacqua and Fabio Petroni and Percy Liang},
year={2023},
eprint={2307.03172},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{johnson2017billionscale,
title={{Billion-scale similarity search with GPUs}},
author={Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou},
year={2017},
eprint={1702.08734},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{2019t5,
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
journal = {arXiv e-prints},
year = {2019},
archivePrefix = {arXiv},
eprint = {1910.10683},
}
@inproceedings{
zhang2023ho,
title={{H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models}},
author={Zhenyu Zhang and Ying Sheng and Tianyi Zhou and Tianlong Chen and Lianmin Zheng and Ruisi Cai and Zhao Song and Yuandong Tian and Christopher Re and Clark Barrett and Zhangyang Wang and Beidi Chen},
booktitle={{Workshop on Efficient Systems for Foundation Models @ ICML2023}},
year={2023},
url={https://openreview.net/forum?id=ctPizehA9D}
}
@misc{dejong2023fido,
title={{FiDO: Fusion-in-Decoder optimized for stronger performance and faster inference}},
author={Michiel de Jong and Yury Zemlyanskiy and Joshua Ainslie and Nicholas FitzGerald and Sumit Sanghai and Fei Sha and William Cohen},
year={2023},
eprint={2212.08153},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{mohtashami2023landmark,
title={{Landmark Attention: Random-Access Infinite Context Length for Transformers}},
author={Amirkeivan Mohtashami and Martin Jaggi},
year={2023},
eprint={2305.16300},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{
borzunov2023distributed,
title={{Distributed Inference and Fine-tuning of Large Language Models Over The Internet}},
author={Alexander Borzunov and Dmitry Baranchuk and Tim Dettmers and Max Ryabinin and Younes Belkada and Artem Chumachenko and Pavel Samygin and Colin Raffel},
year={2023},
url={https://openreview.net/forum?id=HLQyRgRnoXo}
}
@misc{ding2023selfagreement,
title={{Self-Agreement: A Framework for Fine-tuning Language Models to Find Agreement among Diverse Opinions}},
author={Shiyao Ding and Takayuki Ito},
year={2023},
eprint={2305.11460},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{lin2023speciality,
title={{Speciality vs Generality: An Empirical Study on Catastrophic Forgetting in Fine-tuning Foundation Models}},
author={Lin, Yong and Tan, Lu and Lin, Hangyu and Zheng, Zeming and Pi, Renjie and Zhang, Jipeng and Diao, Shizhe and Wang, Haoxiang and Zhao, Han and Yao, Yuan and others},
journal={arXiv preprint arXiv:2309.06256},
year={2023}
}
@article{47761,
title = {Natural Questions: a Benchmark for Question Answering Research},
author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year = {2019},
journal = {Transactions of the Association for Computational Linguistics}
}
@misc{agostinelli2023musiclm,
title={{MusicLM: Generating Music From Text}},
author={Andrea Agostinelli and Timo I. Denk and Zal{\'a}n Borsos and Jesse Engel and Mauro Verzetti and Antoine Caillon and Qingqing Huang and Aren Jansen and Adam Roberts and Marco Tagliasacchi and Matt Sharifi and Neil Zeghidour and Christian Frank},
year={2023},
eprint={2301.11325},
archivePrefix={arXiv},
primaryClass={cs.SD}
}
@misc{chen2023extending,
title={{Extending Context Window of Large Language Models via Positional Interpolation}},
author={Shouyuan Chen and Sherman Wong and Liangjian Chen and Yuandong Tian},
year={2023},
eprint={2306.15595},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{kitaev2020reformer,
title={{Reformer: The Efficient Transformer}},
author={Nikita Kitaev and {\L}ukasz Kaiser and Anselm Levskaya},
year={2020},
eprint={2001.04451},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{luo2021stable,
title={{Stable, Fast and Accurate: Kernelized Attention with Relative Positional Encoding}},
author={Shengjie Luo and Shanda Li and Tianle Cai and Di He and Dinglan Peng and Shuxin Zheng and Guolin Ke and Liwei Wang and Tie-Yan Liu},
year={2021},
eprint={2106.12566},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{workshop2023bloom,
title={{BLOOM: A 176B-Parameter Open-Access Multilingual Language Model}},
author={BigScience Workshop and : and Teven Le Scao and Angela Fan and Christopher Akiki and Ellie Pavlick and Suzana Ilić and Daniel Hesslow and Roman Castagné and Alexandra Sasha Luccioni and François Yvon and Matthias Gallé and Jonathan Tow and Alexander M. Rush and Stella Biderman and Albert Webson and Pawan Sasanka Ammanamanchi and Thomas Wang and Benoît Sagot and Niklas Muennighoff and Albert Villanova del Moral and Olatunji Ruwase and Rachel Bawden and Stas Bekman and Angelina McMillan-Major and Iz Beltagy and Huu Nguyen and Lucile Saulnier and Samson Tan and Pedro Ortiz Suarez and Victor Sanh and Hugo Laurençon and Yacine Jernite and Julien Launay and Margaret Mitchell and Colin Raffel and Aaron Gokaslan and Adi Simhi and Aitor Soroa and Alham Fikri Aji and Amit Alfassy and Anna Rogers and Ariel Kreisberg Nitzav and Canwen Xu and Chenghao Mou and Chris Emezue and Christopher Klamm and Colin Leong and Daniel van Strien and David Ifeoluwa Adelani and Dragomir Radev and Eduardo González Ponferrada and Efrat Levkovizh and Ethan Kim and Eyal Bar Natan and Francesco De Toni and Gérard Dupont and Germán Kruszewski and Giada Pistilli and Hady Elsahar and Hamza Benyamina and Hieu Tran and Ian Yu and Idris Abdulmumin and Isaac Johnson and Itziar Gonzalez-Dios and Javier de la Rosa and Jenny Chim and Jesse Dodge and Jian Zhu and Jonathan Chang and Jörg Frohberg and Joseph Tobing and Joydeep Bhattacharjee and Khalid Almubarak and Kimbo Chen and Kyle Lo and Leandro Von Werra and Leon Weber and Long Phan and Loubna Ben allal and Ludovic Tanguy and Manan Dey and Manuel Romero Muñoz and Maraim Masoud and María Grandury and Mario Šaško and Max Huang and Maximin Coavoux and Mayank Singh and Mike Tian-Jian Jiang and Minh Chien Vu and Mohammad A. 
Jauhar and Mustafa Ghaleb and Nishant Subramani and Nora Kassner and Nurulaqilla Khamis and Olivier Nguyen and Omar Espejel and Ona de Gibert and Paulo Villegas and Peter Henderson and Pierre Colombo and Priscilla Amuok and Quentin Lhoest and Rheza Harliman and Rishi Bommasani and Roberto Luis López and Rui Ribeiro and Salomey Osei and Sampo Pyysalo and Sebastian Nagel and Shamik Bose and Shamsuddeen Hassan Muhammad and Shanya Sharma and Shayne Longpre and Somaieh Nikpoor and Stanislav Silberberg and Suhas Pai and Sydney Zink and Tiago Timponi Torrent and Timo Schick and Tristan Thrush and Valentin Danchev and Vassilina Nikoulina and Veronika Laippala and Violette Lepercq and Vrinda Prabhu and Zaid Alyafeai and Zeerak Talat and Arun Raja and Benjamin Heinzerling and Chenglei Si and Davut Emre Taşar and Elizabeth Salesky and Sabrina J. Mielke and Wilson Y. Lee and Abheesht Sharma and Andrea Santilli and Antoine Chaffin and Arnaud Stiegler and Debajyoti Datta and Eliza Szczechla and Gunjan Chhablani and Han Wang and Harshit Pandey and Hendrik Strobelt and Jason Alan Fries and Jos Rozen and Leo Gao and Lintang Sutawika and M Saiful Bari and Maged S. Al-shaibani and Matteo Manica and Nihal Nayak and Ryan Teehan and Samuel Albanie and Sheng Shen and Srulik Ben-David and Stephen H. 
Bach and Taewoon Kim and Tali Bers and Thibault Fevry and Trishala Neeraj and Urmish Thakker and Vikas Raunak and Xiangru Tang and Zheng-Xin Yong and Zhiqing Sun and Shaked Brody and Yallow Uri and Hadar Tojarieh and Adam Roberts and Hyung Won Chung and Jaesung Tae and Jason Phang and Ofir Press and Conglong Li and Deepak Narayanan and Hatim Bourfoune and Jared Casper and Jeff Rasley and Max Ryabinin and Mayank Mishra and Minjia Zhang and Mohammad Shoeybi and Myriam Peyrounette and Nicolas Patry and Nouamane Tazi and Omar Sanseviero and Patrick von Platen and Pierre Cornette and Pierre François Lavallée and Rémi Lacroix and Samyam Rajbhandari and Sanchit Gandhi and Shaden Smith and Stéphane Requena and Suraj Patil and Tim Dettmers and Ahmed Baruwa and Amanpreet Singh and Anastasia Cheveleva and Anne-Laure Ligozat and Arjun Subramonian and Aurélie Névéol and Charles Lovering and Dan Garrette and Deepak Tunuguntla and Ehud Reiter and Ekaterina Taktasheva and Ekaterina Voloshina and Eli Bogdanov and Genta Indra Winata and Hailey Schoelkopf and Jan-Christoph Kalo and Jekaterina Novikova and Jessica Zosa Forde and Jordan Clive and Jungo Kasai and Ken Kawamura and Liam Hazan and Marine Carpuat and Miruna Clinciu and Najoung Kim and Newton Cheng and Oleg Serikov and Omer Antverg and Oskar van der Wal and Rui Zhang and Ruochen Zhang and Sebastian Gehrmann and Shachar Mirkin and Shani Pais and Tatiana Shavrina and Thomas Scialom and Tian Yun and Tomasz Limisiewicz and Verena Rieser and Vitaly Protasov and Vladislav Mikhailov and Yada Pruksachatkun and Yonatan Belinkov and Zachary Bamberger and Zdeněk Kasner and Alice Rueda and Amanda Pestana and Amir Feizpour and Ammar Khan and Amy Faranak and Ana Santos and Anthony Hevia and Antigona Unldreaj and Arash Aghagol and Arezoo Abdollahi and Aycha Tammour and Azadeh HajiHosseini and Bahareh Behroozi and Benjamin Ajibade and Bharat Saxena and Carlos Muñoz Ferrandis and Daniel McDuff and Danish Contractor and David Lansky and Davis 
David and Douwe Kiela and Duong A. Nguyen and Edward Tan and Emi Baylor and Ezinwanne Ozoani and Fatima Mirza and Frankline Ononiwu and Habib Rezanejad and Hessie Jones and Indrani Bhattacharya and Irene Solaiman and Irina Sedenko and Isar Nejadgholi and Jesse Passmore and Josh Seltzer and Julio Bonis Sanz and Livia Dutra and Mairon Samagaio and Maraim Elbadri and Margot Mieskes and Marissa Gerchick and Martha Akinlolu and Michael McKenna and Mike Qiu and Muhammed Ghauri and Mykola Burynok and Nafis Abrar and Nazneen Rajani and Nour Elkott and Nour Fahmy and Olanrewaju Samuel and Ran An and Rasmus Kromann and Ryan Hao and Samira Alizadeh and Sarmad Shubber and Silas Wang and Sourav Roy and Sylvain Viguier and Thanh Le and Tobi Oyebade and Trieu Le and Yoyo Yang and Zach Nguyen and Abhinav Ramesh Kashyap and Alfredo Palasciano and Alison Callahan and Anima Shukla and Antonio Miranda-Escalada and Ayush Singh and Benjamin Beilharz and Bo Wang and Caio Brito and Chenxi Zhou and Chirag Jain and Chuxin Xu and Clémentine Fourrier and Daniel León Periñán and Daniel Molano and Dian Yu and Enrique Manjavacas and Fabio Barth and Florian Fuhrimann and Gabriel Altay and Giyaseddin Bayrak and Gully Burns and Helena U. 
Vrabec and Imane Bello and Ishani Dash and Jihyun Kang and John Giorgi and Jonas Golde and Jose David Posada and Karthik Rangasai Sivaraman and Lokesh Bulchandani and Lu Liu and Luisa Shinzato and Madeleine Hahn de Bykhovetz and Maiko Takeuchi and Marc Pàmies and Maria A Castillo and Marianna Nezhurina and Mario Sänger and Matthias Samwald and Michael Cullan and Michael Weinberg and Michiel De Wolf and Mina Mihaljcic and Minna Liu and Moritz Freidank and Myungsun Kang and Natasha Seelam and Nathan Dahlberg and Nicholas Michio Broad and Nikolaus Muellner and Pascale Fung and Patrick Haller and Ramya Chandrasekhar and Renata Eisenberg and Robert Martin and Rodrigo Canalli and Rosaline Su and Ruisi Su and Samuel Cahyawijaya and Samuele Garda and Shlok S Deshmukh and Shubhanshu Mishra and Sid Kiblawi and Simon Ott and Sinee Sang-aroonsiri and Srishti Kumar and Stefan Schweter and Sushil Bharati and Tanmay Laud and Théo Gigant and Tomoya Kainuma and Wojciech Kusa and Yanis Labrak and Yash Shailesh Bajaj and Yash Venkatraman and Yifan Xu and Yingxin Xu and Yu Xu and Zhe Tan and Zhongli Xie and Zifan Ye and Mathilde Bras and Younes Belkada and Thomas Wolf},
year={2023},
eprint={2211.05100},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{raffel2023exploring,
title={{Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}},
author={Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
year={2023},
eprint={1910.10683},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{touvron2023llama,
title={{LLaMA: Open and Efficient Foundation Language Models}},
author={Hugo Touvron and Thibaut Lavril and Gautier Izacard and Xavier Martinet and Marie-Anne Lachaux and Timoth{\'e}e Lacroix and Baptiste Rozi{\`e}re and Naman Goyal and Eric Hambro and Faisal Azhar and Aurelien Rodriguez and Armand Joulin and Edouard Grave and Guillaume Lample},
year={2023},
eprint={2302.13971},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{zhang2022opt,
title={{OPT: Open Pre-trained Transformer Language Models}},
author={Susan Zhang and Stephen Roller and Naman Goyal and Mikel Artetxe and Moya Chen and Shuohui Chen and Christopher Dewan and Mona Diab and Xian Li and Xi Victoria Lin and Todor Mihaylov and Myle Ott and Sam Shleifer and Kurt Shuster and Daniel Simig and Punit Singh Koura and Anjali Sridhar and Tianlu Wang and Luke Zettlemoyer},
year={2022},
eprint={2205.01068},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{zhai2022scaling,
title={{Scaling vision transformers}},
author={Zhai, Xiaohua and Kolesnikov, Alexander and Houlsby, Neil and Beyer, Lucas},
booktitle={{Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}},
pages={12104--12113},
year={2022}
}
@article{kasneci2023chatgpt,
title={{ChatGPT for good? On opportunities and challenges of large language models for education}},
author={Kasneci, Enkelejda and Se{\ss}ler, Kathrin and K{\"u}chemann, Stefan and Bannert, Maria and Dementieva, Daryna and Fischer, Frank and Gasser, Urs and Groh, Georg and G{\"u}nnemann, Stephan and H{\"u}llermeier, Eyke and others},
journal={Learning and individual differences},
volume={103},
pages={102274},
year={2023},
publisher={Elsevier}
}
@misc{google_bard,
title = {Bard - Chat Based AI Tool from Google, Powered by PaLM 2},
author = {{Google}},
howpublished = {\url{https://bard.google.com/}},
note = {(Accessed on 09/21/2023)}
}
@misc{microsoft_bing,
title = {Bing},
author = {{Microsoft Corporation}},
year = {2009},
howpublished = {\url{https://www.bing.com/}}
}
@misc{bisk2019piqa,
title={{PIQA: Reasoning about Physical Commonsense in Natural Language}},
author={Yonatan Bisk and Rowan Zellers and Ronan Le Bras and Jianfeng Gao and Yejin Choi},
year={2019},
eprint={1911.11641},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{adiwardana2020humanlike,
title={{Towards a Human-like Open-Domain Chatbot}},
author={Daniel Adiwardana and Minh-Thang Luong and David R. So and Jamie Hall and Noah Fiedel and Romal Thoppilan and Zi Yang and Apoorv Kulshreshtha and Gaurav Nemade and Yifeng Lu and Quoc V. Le},
year={2020},
eprint={2001.09977},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{openai2023gpt4,
title={{GPT-4 Technical Report}},
author={OpenAI},
year={2023},
eprint={2303.08774},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{wu2023fast,
title={{Fast Distributed Inference Serving for Large Language Models}},
author={Bingyang Wu and Yinmin Zhong and Zili Zhang and Gang Huang and Xuanzhe Liu and Xin Jin},
year={2023},
eprint={2305.05920},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{pope2022efficiently,
title={{Efficiently Scaling Transformer Inference}},
author={Reiner Pope and Sholto Douglas and Aakanksha Chowdhery and Jacob Devlin and James Bradbury and Anselm Levskaya and Jonathan Heek and Kefan Xiao and Shivani Agrawal and Jeff Dean},
year={2022},
eprint={2211.05102},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{dao2022flashattention,
title={{FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness}},
author={Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher R{\'e}},
year={2022},
eprint={2205.14135},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{narayanan2023cheaply,
title={{Cheaply Evaluating Inference Efficiency Metrics for Autoregressive Transformer APIs}},
author={Deepak Narayanan and Keshav Santhanam and Peter Henderson and Rishi Bommasani and Tony Lee and Percy Liang},
year={2023},
eprint={2305.02440},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{FasterTransformer,
author = {{NVIDIA}},
title = {FasterTransformer},
year = {2019},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/NVIDIA/FasterTransformer.git}},
commit = {c6e8f60}
}
@misc{anagnostidis2023dynamic,
title={{Dynamic Context Pruning for Efficient and Interpretable Autoregressive Transformers}},
author={Sotiris Anagnostidis and Dario Pavllo and Luca Biggio and Lorenzo Noci and Aurelien Lucchi and Thomas Hofmann},
year={2023},
eprint={2305.15805},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{shepherd,
title={{SHEPHERD}: Serving {DNNs} in the Wild},
author={Zhang, Hong and Tang, Yupeng and Khandelwal, Anurag and Stoica, Ion},
booktitle={{20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)}},
pages={787--808},
year={2023}
}
@inproceedings{clockwork,
title={Serving {DNNs} like Clockwork: Performance Predictability from the Bottom Up},
author={Gujarati, Arpan and Karimi, Reza and Alzayat, Safya and Hao, Wei and Kaufmann, Antoine and Vigfusson, Ymir and Mace, Jonathan},
booktitle={{14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)}},
pages={443--462},
year={2020}
}
@inproceedings{orca,
title={Orca: A Distributed Serving System for {Transformer-Based} Generative Models},
author={Yu, Gyeong-In and Jeong, Joo Seong and Kim, Geon-Woo and Kim, Soojeong and Chun, Byung-Gon},
booktitle={{16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}},
pages={521--538},
year={2022}
}
@article{efficiently-scaling,
title={{Efficiently scaling transformer inference}},
author={Pope, Reiner and Douglas, Sholto and Chowdhery, Aakanksha and Devlin, Jacob and Bradbury, James and Heek, Jonathan and Xiao, Kefan and Agrawal, Shivani and Dean, Jeff},
journal={Proceedings of Machine Learning and Systems},
volume={5},
year={2023}
}
@misc{llm-app-1,
title = {Applications of Large Language Models - InData Labs},
howpublished = {\url{https://indatalabs.com/blog/large-language-model-apps}},
note = {(Accessed on 09/21/2023)}
}
@misc{llm-app-2,
title = {12 Practical Large Language Model (LLM) Applications - Techopedia},
howpublished = {\url{https://www.techopedia.com/12-practical-large-language-model-llm-applications}},
note = {(Accessed on 09/21/2023)}
}
@misc{llm-app-3,
title = {7 Top Large Language Model Use Cases And Applications},
howpublished = {\url{https://www.projectpro.io/article/large-language-model-use-cases-and-applications/887}},
note = {(Accessed on 09/21/2023)}
}
@misc{llm-app-4,
title = {Real-World Use Cases for Large Language Models (LLMs) | by CellStrat | Medium},
howpublished = {\url{https://cellstrat.medium.com/real-world-use-cases-for-large-language-models-llms-d71c3a577bf2}},
note = {(Accessed on 09/21/2023)}
}
@misc{gpt4-api,
title = {GPT-4 API general availability and deprecation of older models in the Completions API},
howpublished = {\url{https://openai.com/blog/gpt-4-api-general-availability}},
note = {(Accessed on 09/21/2023)}
}
@misc{langchain,
title = {langchain-ai/langchain: Building applications with LLMs through composability},
howpublished = {\url{https://github.com/langchain-ai/langchain}},
note = {(Accessed on 09/21/2023)}
}
@comment{The "llama" entry below cites the same work as touvron2023llama above;
it is kept so existing \cite{llama} commands still resolve.}
@misc{llama,
title = {{LLaMA: Open and Efficient Foundation Language Models}},
howpublished = {\url{https://arxiv.org/abs/2302.13971}},
note = {(Accessed on 09/21/2023)}
}
@misc{long-context,
title = {Anthropic \textbackslash{} Introducing 100K Context Windows},
howpublished = {\url{https://www.anthropic.com/index/100k-context-windows}},
note = {(Accessed on 09/21/2023)}
}
@article{arxiv-1,
title={{Lost in the middle: How language models use long contexts}},
author={Liu, Nelson F and Lin, Kevin and Hewitt, John and Paranjape, Ashwin and Bevilacqua, Michele and Petroni, Fabio and Liang, Percy},
journal={arXiv preprint arXiv:2307.03172},
year={2023}
}
@article{arxiv-2,
title={{Do long-range language models actually use long-range context?}},
author={Sun, Simeng and Krishna, Kalpesh and Mattarella-Micke, Andrew and Iyyer, Mohit},
journal={arXiv preprint arXiv:2109.09115},
year={2021}
}
@misc{langchain-retrival,
title = {Store and reference chat history | Langchain},
howpublished = {\url{https://python.langchain.com/docs/use_cases/question_answering/how_to/chat_vector_db}},
note = {(Accessed on 09/21/2023)}
}
@misc{arxiv-information-retrieval,
title = {{Improving Language Models by Retrieving from Trillions of Tokens}},
howpublished = {\url{https://arxiv.org/pdf/2112.04426.pdf}},
note = {(Accessed on 09/21/2023)}
}
@misc{generative-agents,
author = {},
title = {[2304.03442] Generative Agents: Interactive Simulacra of Human Behavior},
howpublished = {\url{https://arxiv.org/abs/2304.03442}},
month = {},