-
Notifications
You must be signed in to change notification settings - Fork 0
/
temp.bib
1205 lines (1102 loc) · 67 KB
/
temp.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@article{yin2021tt,
title={{Tt-rec: Tensor train compression for deep learning recommendation models}},
author={Yin, Chunxing and Acun, Bilge and Wu, Carole-Jean and Liu, Xing},
journal={Proceedings of Machine Learning and Systems},
volume={3},
pages={448--462},
year={2021}
}
@misc{KelSolaar2023,
author = {{KelSolaar}},
title = {fvvt-kels-utilities},
year = {2023},
note = {GitHub repository},
howpublished = {\url{https://github.com/KelSolaar/fvvt-kels-utilities}}
}
@inproceedings{mentzer2019practical,
title = {Practical Full Resolution Learned Lossless Image Compression},
author = {Mentzer, Fabian and Agustsson, Eirikur and Tschannen, Michael and Timofte, Radu and Van Gool, Luc},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2019}
}
@misc{brown2020language,
title={{Language Models are Few-Shot Learners}},
author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
year={2020},
eprint={2005.14165},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@online{MosaicML2023Introducing,
author = {{MosaicML NLP Team}},
title = {Introducing MPT-7B: A New Standard for Open-Source,
Commercially Usable LLMs},
year = {2023},
url = {https://www.mosaicml.com/blog/mpt-7b},
note = {Accessed: 2023-05-05},
urldate = {2023-05-05}
}
@misc{wikitext,
title={{Pointer Sentinel Mixture Models}},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{zheng2023judging,
title={{Judging LLM-as-a-judge with MT-Bench and Chatbot Arena}},
author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
year={2023},
eprint={2306.05685},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{atlas,
title={{Few-shot learning with retrieval augmented language models}},
author={Izacard, Gautier and Lewis, Patrick and Lomeli, Maria and Hosseini, Lucas and Petroni, Fabio and Schick, Timo and Dwivedi-Yu, Jane and Joulin, Armand and Riedel, Sebastian and Grave, Edouard},
journal={arXiv preprint arXiv:2208.03299},
year={2022}
}
@misc{ram2023incontext,
title={{In-Context Retrieval-Augmented Language Models}},
author={Ori Ram and Yoav Levine and Itay Dalmedigos and Dor Muhlgay and Amnon Shashua and Kevin Leyton-Brown and Yoav Shoham},
year={2023},
eprint={2302.00083},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{liu2023scissorhands,
title={{Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time}},
author={Liu, Zichang and Desai, Aditya and Liao, Fangshuo and Wang, Weitao and Xie, Victor and Xu, Zhaozhuo and Kyrillidis, Anastasios and Shrivastava, Anshumali},
journal={arXiv preprint arXiv:2305.17118},
year={2023}
}
@misc{fid,
title={{Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering}},
author={Gautier Izacard and Edouard Grave},
year={2021},
eprint={2007.01282},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{izacard2020memory,
title={{A Memory Efficient Baseline for Open Domain Question Answering}},
author={Gautier Izacard and Fabio Petroni and Lucas Hosseini and Nicola De Cao and Sebastian Riedel and Edouard Grave},
year={2020},
eprint={2012.15156},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{ainslie2020etc,
title={{ETC: Encoding Long and Structured Inputs in Transformers}},
author={Joshua Ainslie and Santiago Ontanon and Chris Alberti and Vaclav Cvicek and Zachary Fisher and Philip Pham and Anirudh Ravula and Sumit Sanghai and Qifan Wang and Li Yang},
year={2020},
eprint={2004.08483},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{zaheer2021big,
title={{Big Bird: Transformers for Longer Sequences}},
author={Manzil Zaheer and Guru Guruganesh and Avinava Dubey and Joshua Ainslie and Chris Alberti and Santiago Ontanon and Philip Pham and Anirudh Ravula and Qifan Wang and Li Yang and Amr Ahmed},
year={2021},
eprint={2007.14062},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@article{lewis2020retrieval,
title={{Retrieval-augmented generation for knowledge-intensive nlp tasks}},
author={Lewis, Patrick and Perez, Ethan and Piktus, Aleksandra and Petroni, Fabio and Karpukhin, Vladimir and Goyal, Naman and K{\"u}ttler, Heinrich and Lewis, Mike and Yih, Wen-tau and Rockt{\"a}schel, Tim and others},
journal={Advances in Neural Information Processing Systems},
volume={33},
pages={9459--9474},
year={2020}
}
@article{rubin2023long,
title={{Long-range Language Modeling with Self-retrieval}},
author={Rubin, Ohad and Berant, Jonathan},
journal={arXiv preprint arXiv:2306.13421},
year={2023}
}
@misc{ding2023longnet,
title={{LongNet: Scaling Transformers to 1,000,000,000 Tokens}},
author={Jiayu Ding and Shuming Ma and Li Dong and Xingxing Zhang and Shaohan Huang and Wenhui Wang and Nanning Zheng and Furu Wei},
year={2023},
eprint={2307.02486},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@InProceedings{pmlr-v162-hawthorne22a,
title = {General-purpose, long-context autoregressive modeling with Perceiver {AR}},
author = {Hawthorne, Curtis and Jaegle, Andrew and Cangea, C{\u{a}}t{\u{a}}lina and Borgeaud, Sebastian and Nash, Charlie and Malinowski, Mateusz and Dieleman, Sander and Vinyals, Oriol and Botvinick, Matthew and Simon, Ian and Sheahan, Hannah and Zeghidour, Neil and Alayrac, Jean-Baptiste and Carreira, Joao and Engel, Jesse},
booktitle = {Proceedings of the 39th International Conference on Machine Learning},
pages = {8535--8558},
year = {2022},
editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
volume = {162},
series = {Proceedings of Machine Learning Research},
month = {17--23 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v162/hawthorne22a/hawthorne22a.pdf},
url = {https://proceedings.mlr.press/v162/hawthorne22a.html},
abstract = {Real-world data is high-dimensional: a book, image, or musical performance can easily contain hundreds of thousands of elements even after compression. However, the most commonly used autoregressive models, Transformers, are prohibitively expensive to scale to the number of inputs and layers needed to capture this long-range structure. We develop Perceiver AR, an autoregressive, modality-agnostic architecture which uses cross-attention to map long-range inputs to a small number of latents while also maintaining end-to-end causal masking. Perceiver AR can directly attend to over a hundred thousand tokens, enabling practical long-context density estimation without the need for hand-crafted sparsity patterns or memory mechanisms. When trained on images or music, Perceiver AR generates outputs with clear long-term coherence and structure. Our architecture also obtains state-of-the-art likelihood on long-sequence benchmarks, including 64x64 ImageNet images and PG-19 books.}
}
@misc{retro,
title={{Improving language models by retrieving from trillions of tokens}},
author={Sebastian Borgeaud and Arthur Mensch and Jordan Hoffmann and Trevor Cai and Eliza Rutherford and Katie Millican and George van den Driessche and Jean-Baptiste Lespiau and Bogdan Damoc and Aidan Clark and Diego de Las Casas and Aurelia Guy and Jacob Menick and Roman Ring and Tom Hennigan and Saffron Huang and Loren Maggiore and Chris Jones and Albin Cassirer and Andy Brock and Michela Paganini and Geoffrey Irving and Oriol Vinyals and Simon Osindero and Karen Simonyan and Jack W. Rae and Erich Elsen and Laurent Sifre},
year={2022},
eprint={2112.04426},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{MLSYS2022_773862fc,
author = {Agarwal, Saurabh and Wang, Hongyi and Venkataraman, Shivaram and Papailiopoulos, Dimitris},
booktitle = {Proceedings of Machine Learning and Systems},
editor = {D. Marculescu and Y. Chi and C. Wu},
pages = {652--672},
title = {On the Utility of Gradient Compression in Distributed Training Systems},
url = {https://proceedings.mlsys.org/paper_files/paper/2022/file/773862fcc2e29f650d68960ba5bd1101-Paper.pdf},
volume = {4},
year = {2022}
}
@misc{bernstein2018signsgd,
title={{signSGD: Compressed Optimisation for Non-Convex Problems}},
author={Jeremy Bernstein and Yu-Xiang Wang and Kamyar Azizzadenesheli and Anima Anandkumar},
year={2018},
eprint={1802.04434},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@article{shi2023replug,
title={{Replug: Retrieval-augmented black-box language models}},
author={Shi, Weijia and Min, Sewon and Yasunaga, Michihiro and Seo, Minjoon and James, Rich and Lewis, Mike and Zettlemoyer, Luke and Yih, Wen-tau},
journal={arXiv preprint arXiv:2301.12652},
year={2023}
}
@misc{usagereport,
author = {Fishkin, Rand},
title = {We Analyzed Millions of ChatGPT User Sessions: Visits are Down 29\% Since May; Programming Assistance is 30\% of Use},
year = {2023},
note = {SparkToro Blog},
url = {https://sparktoro.com/blog/we-analyzed-millions-of-chatgpt-user-sessions-visits-are-down-29-since-may-programming-assistance-is-30-of-use/}
}
@article{ppl2,
author = "Stanley F Chen and Douglas Beeferman and Roni Rosenfeld",
title = "{Evaluation Metrics For Language Models}",
year = "2008",
month = "1",
url = "https://kilthub.cmu.edu/articles/journal_contribution/Evaluation_Metrics_For_Language_Models/6605324",
doi = "10.1184/R1/6605324.v1"
}
@inproceedings{ppl1,
author = {Azzopardi, Leif and Girolami, Mark and van Rijsbergen, Keith},
title = {Investigating the Relationship between Language Model Perplexity and IR Precision-Recall Measures},
year = {2003},
isbn = {1581136463},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/860435.860505},
doi = {10.1145/860435.860505},
abstract = {An empirical study has been conducted investigating the relationship between the performance of an aspect based language model in terms of perplexity and the corresponding information retrieval performance obtained. It is observed, on the corpora considered, that the perplexity of the language model has a systematic relationship with the achievable precision recall performance though it is not statistically significant.},
booktitle = {Proceedings of the 26th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval},
pages = {369--370},
numpages = {2},
keywords = {language model},
location = {Toronto, Canada},
series = {SIGIR '03}
}
@misc{beltagy2020longformer,
title={{Longformer: The Long-Document Transformer}},
author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
year={2020},
eprint={2004.05150},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{bertsch2023unlimiformer,
title={{Unlimiformer: Long-range transformers with unlimited length input}},
author={Bertsch, Amanda and Alon, Uri and Neubig, Graham and Gormley, Matthew R},
journal={arXiv preprint arXiv:2305.01625},
year={2023}
}
@article{roy2021efficient,
title={{Efficient content-based sparse attention with routing transformers}},
author={Roy, Aurko and Saffar, Mohammad and Vaswani, Ashish and Grangier, David},
journal={Transactions of the Association for Computational Linguistics},
volume={9},
pages={53--68},
year={2021},
publisher={MIT Press}
}
@misc{
dai*2019transformerxl,
title={Transformer-{XL}: Language Modeling with Longer-Term Dependency},
author={Zihang Dai and Zhilin Yang and Yiming Yang and William W. Cohen and Jaime Carbonell and Quoc V. Le and Ruslan Salakhutdinov},
year={2019},
url={https://openreview.net/forum?id=HJePno0cYm}
}
@inproceedings{
wu2022memorizing,
title={{Memorizing Transformers}},
author={Yuhuai Wu and Markus Norman Rabe and DeLesley Hutchins and Christian Szegedy},
booktitle={{International Conference on Learning Representations}},
year={2022},
url={https://openreview.net/forum?id=TrjbxzRcnf-}
}
@article{yi2023edgemoe,
title={{EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models}},
author={Yi, Rongjie and Guo, Liwei and Wei, Shiyun and Zhou, Ao and Wang, Shangguang and Xu, Mengwei},
journal={arXiv preprint arXiv:2308.14352},
year={2023}
}
@article{miao2023specinfer,
title={{SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification}},
author={Miao, Xupeng and Oliaro, Gabriele and Zhang, Zhihao and Cheng, Xinhao and Wang, Zeyu and Wong, Rae Ying Yee and Chen, Zhuoming and Arfeen, Daiyaan and Abhyankar, Reyna and Jia, Zhihao},
journal={arXiv preprint arXiv:2305.09781},
year={2023}
}
@inproceedings{aminabadi2022deepspeed,
title={{DeepSpeed-inference: enabling efficient inference of transformer models at unprecedented scale}},
author={Aminabadi, Reza Yazdani and Rajbhandari, Samyam and Awan, Ammar Ahmad and Li, Cheng and Li, Du and Zheng, Elton and Ruwase, Olatunji and Smith, Shaden and Zhang, Minjia and Rasley, Jeff and others},
booktitle={{SC22: International Conference for High Performance Computing, Networking, Storage and Analysis}},
pages={1--15},
year={2022},
organization={IEEE}
}
@inproceedings{deepspeed,
author = {Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
title = {DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters},
year = {2020},
isbn = {9781450379984},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3394486.3406703},
doi = {10.1145/3394486.3406703},
abstract = {Explore new techniques in Microsoft's open source library called DeepSpeed, which advances large model training by improving scale, speed, cost, and usability, unlocking the ability to train 100-billion-parameter models. DeepSpeed is compatible with PyTorch. One piece of our library, called ZeRO, is a new parallelized optimizer that greatly reduces the resources needed for model and data parallelism while massively increasing the number of parameters that can be trained. Researchers have used these breakthroughs to create Turing Natural Language Generation (Turing-NLG), which at the time of its release was the largest publicly known language model at 17 billion parameters. In addition we will also go over our latest transformer kernel advancements that led the DeepSpeed team to achieve the world fastest BERT pretraining record.The Zero Redundancy Optimizer (ZeRO) is a novel memory optimization technology for large-scale distributed deep learning. ZeRO can train deep learning models with over 100 billion parameters on the current generation of GPU clusters at three to five times the throughput of the current best system. It also presents a clear path to training models with trillions of parameters, demonstrating an unprecedented leap in deep learning system technology.DeepSpeed brings state-of-the-art training techniques, such as ZeRO, optimized kernels, distributed training, mixed precision, and checkpointing, through lightweight APIs compatible with PyTorch. With just a few lines of code changes to your PyTorch model, you can leverage DeepSpeed to address underlying performance challenges and boost the speed and scale of your training.},
booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
pages = {3505–3506},
numpages = {2},
keywords = {machine learning, distributed deep learning},
location = {Virtual Event, CA, USA},
series = {KDD '20}
}
@article{shoeybi2019megatron,
title={{Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}},
author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
journal={arXiv preprint arXiv:1909.08053},
year={2019}
}
@inproceedings{Leviathan2022FastIF,
title={{Fast Inference from Transformers via Speculative Decoding}},
author={Yaniv Leviathan and Matan Kalman and Yossi Matias},
booktitle={{International Conference on Machine Learning}},
year={2022},
url={https://api.semanticscholar.org/CorpusID:254096365}
}
@article{agarwal2020accordion,
title={{Accordion: Adaptive gradient communication via critical learning regime identification}},
author={Agarwal, Saurabh and Wang, Hongyi and Lee, Kangwook and Venkataraman, Shivaram and Papailiopoulos, Dimitris},
journal={arXiv preprint arXiv:2010.16248},
year={2020}
}
@misc{zhao2016tensor,
title={{Tensor Ring Decomposition}},
author={Qibin Zhao and Guoxu Zhou and Shengli Xie and Liqing Zhang and Andrzej Cichocki},
year={2016},
eprint={1606.05535},
archivePrefix={arXiv},
primaryClass={cs.NA}
}
@article{tensor_decomp,
author = {Oseledets, I. V.},
title = {Tensor-Train Decomposition},
journal = {SIAM Journal on Scientific Computing},
volume = {33},
number = {5},
pages = {2295--2317},
year = {2011},
doi = {10.1137/090752286},
url = {https://doi.org/10.1137/090752286}
}
@inproceedings{espresso,
author = {Wang, Zhuang and Lin, Haibin and Zhu, Yibo and Ng, T. S. Eugene},
title = {Hi-Speed DNN Training with Espresso: Unleashing the Full Potential of Gradient Compression with Near-Optimal Usage Strategies},
year = {2023},
isbn = {9781450394871},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3552326.3567505},
doi = {10.1145/3552326.3567505},
abstract = {Gradient compression (GC) is a promising approach to addressing the communication bottleneck in distributed deep learning (DDL). It saves the communication time, but also incurs additional computation overheads. The training throughput of compression-enabled DDL is determined by the compression strategy, including whether to compress each tensor, the type of compute resources (e.g., CPUs or GPUs) for compression, the communication schemes for compressed tensor, and so on. However, it is challenging to find the optimal compression strategy for applying GC to DDL because of the intricate interactions among tensors. To fully unleash the benefits of GC, two questions must be addressed: 1) How to express any compression strategies and the corresponding interactions among tensors of any DDL training job? 2) How to quickly select a near-optimal compression strategy?In this paper, we propose Espresso to answer these questions. It first designs a decision tree abstraction to express any compression strategies and develops empirical models to timeline tensor computation, communication, and compression to enable Espresso to derive the intricate interactions among tensors. It then designs a compression decision algorithm that analyzes tensor interactions to eliminate and prioritize strategies and optimally offloads compression from GPUs to CPUs. Experimental evaluations show that Espresso can improve the training throughput over the start-of-the-art compression-enabled system by up to 77\% for representative DDL training jobs. Moreover, the computational time needed to select the compression strategy is measured in milliseconds, and the selected strategy is only a few percent from optimal.},
booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems},
pages = {867–882},
numpages = {16},
keywords = {systems for machine learning, distributed systems, gradient compression, DNN training},
location = {Rome, Italy},
series = {EuroSys '23}
}
@inproceedings{egeria,
author = {Wang, Yiding and Sun, Decang and Chen, Kai and Lai, Fan and Chowdhury, Mosharaf},
title = {Egeria: Efficient DNN Training with Knowledge-Guided Layer Freezing},
year = {2023},
isbn = {9781450394871},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3552326.3587451},
doi = {10.1145/3552326.3587451},
abstract = {Training deep neural networks (DNNs) is time-consuming. While most existing solutions try to overlap/schedule computation and communication for efficient training, this paper goes one step further by skipping computing and communication through DNN layer freezing. Our key insight is that the training progress of internal DNN layers differs significantly, and front layers often become well-trained much earlier than deep layers. To explore this, we first introduce the notion of training plasticity to quantify the training progress of internal DNN layers. Then we design Egeria, a knowledge-guided DNN training system that employs semantic knowledge from a reference model to accurately evaluate individual layers' training plasticity and safely freeze the converged ones, saving their corresponding backward computation and communication. Our reference model is generated on the fly using quantization techniques and runs forward operations asynchronously on available CPUs to minimize the overhead. In addition, Egeria caches the intermediate outputs of the frozen layers with prefetching to further skip the forward computation. Our implementation and testbed experiments with popular vision and language models show that Egeria achieves 19\%-43\% training speedup w.r.t. the state-of-the-art without sacrificing accuracy.},
booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems},
pages = {851–866},
numpages = {16},
keywords = {layer freezing, machine learning training},
location = {Rome, Italy},
series = {EuroSys '23}
}
@article{rissanen1981efficient,
title={{Efficient arithmetic coding for data compression}},
author={Rissanen, Jorma and Langdon, Jr., Glen G.},
journal={IEEE Transactions on Communications},
volume={29},
number={6},
pages={858--865},
year={1981},
publisher={IEEE}
}
@article{rissanen1976generalized,
title={{Generalized Kraft inequality and arithmetic coding}},
author={Rissanen, Jorma},
journal={IBM Journal of Research and Development},
volume={20},
number={3},
pages={198--203},
year={1976},
publisher={IBM}
}
@article{rissanen1979arithmetic,
title={{Arithmetic coding}},
author={Rissanen, Jorma},
journal={IBM Journal of Research and Development},
volume={23},
number={2},
pages={149--162},
year={1979},
publisher={IBM}
}
@article{ac,
author = {Witten, Ian H. and Neal, Radford M. and Cleary, John G.},
title = {Arithmetic Coding for Data Compression},
year = {1987},
issue_date = {June 1987},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {30},
number = {6},
issn = {0001-0782},
url = {https://doi.org/10.1145/214762.214771},
doi = {10.1145/214762.214771},
abstract = {The state of the art in data compression is arithmetic coding, not the better-known Huffman method. Arithmetic coding gives greater compression, is faster for adaptive models, and clearly separates the model from the channel encoding.},
journal = {Commun. ACM},
month = jun,
pages = {520--540},
numpages = {21}
}
@inproceedings{smoothquant,
title={{SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models}},
author={Xiao, Guangxuan and Lin, Ji and Seznec, Mickael and Wu, Hao and Demouth, Julien and Han, Song},
booktitle={{International Conference on Machine Learning}},
pages={38087--38099},
year={2023},
organization={PMLR}
}
@article{flexgen,
title={{FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU}},
author={Sheng, Ying and Zheng, Lianmin and Yuan, Binhang and Li, Zhuohan and Ryabinin, Max and Fu, Daniel Y and Xie, Zhiqiang and Chen, Beidi and Barrett, Clark and Gonzalez, Joseph E and others},
journal={arXiv preprint arXiv:2303.06865},
year={2023}
}
@article{llmint8,
title={{LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale}},
author={Dettmers, Tim and Lewis, Mike and Belkada, Younes and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:2208.07339},
year={2022}
}
@misc{vaswani2023attention,
title={{Attention Is All You Need}},
author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
year={2023},
eprint={1706.03762},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{pope2023efficiently,
title={{Efficiently scaling transformer inference}},
author={Pope, Reiner and Douglas, Sholto and Chowdhery, Aakanksha and Devlin, Jacob and Bradbury, James and Heek, Jonathan and Xiao, Kefan and Agrawal, Shivani and Dean, Jeff},
journal={Proceedings of Machine Learning and Systems},
volume={5},
year={2023}
}
@article{ott2019fairseq,
title={{fairseq: A fast, extensible toolkit for sequence modeling}},
author={Ott, Myle and Edunov, Sergey and Baevski, Alexei and Fan, Angela and Gross, Sam and Ng, Nathan and Grangier, David and Auli, Michael},
journal={arXiv preprint arXiv:1904.01038},
year={2019}
}
@inproceedings{wolf-etal-2020-transformers,
title = "{Transformers}: State-of-the-Art Natural Language Processing",
author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = oct,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
pages = "38--45"
}
@article{dai2019transformer,
title={{Transformer-xl: Attentive language models beyond a fixed-length context}},
author={Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
journal={arXiv preprint arXiv:1901.02860},
year={2019}
}
@article{shuster2021retrieval,
title={{Retrieval augmentation reduces hallucination in conversation}},
author={Shuster, Kurt and Poff, Spencer and Chen, Moya and Kiela, Douwe and Weston, Jason},
journal={arXiv preprint arXiv:2104.07567},
year={2021}
}
@article{milbauer2023lait,
title={{LAIT: Efficient Multi-Segment Encoding in Transformers with Layer-Adjustable Interaction}},
author={Milbauer, Jeremiah and Louis, Annie and Hosseini, Mohammad Javad and Fabrikant, Alex and Metzler, Donald and Schuster, Tal},
journal={arXiv preprint arXiv:2305.19585},
year={2023}
}
@article{komeili2021internet,
title={{Internet-augmented dialogue generation}},
author={Komeili, Mojtaba and Shuster, Kurt and Weston, Jason},
journal={arXiv preprint arXiv:2107.07566},
year={2021}
}
@article{mialon2023augmented,
title={{Augmented language models: a survey}},
author={Mialon, Gr{\'e}goire and Dess{\`\i}, Roberto and Lomeli, Maria and Nalmpantis, Christoforos and Pasunuru, Ram and Raileanu, Roberta and Rozi{\`e}re, Baptiste and Schick, Timo and Dwivedi-Yu, Jane and Celikyilmaz, Asli and others},
journal={arXiv preprint arXiv:2302.07842},
year={2023}
}
@article{izacard2020leveraging,
title={{Leveraging passage retrieval with generative models for open domain question answering}},
author={Izacard, Gautier and Grave, Edouard},
journal={arXiv preprint arXiv:2007.01282},
year={2020}
}
@article{li2022decoupled,
title={{Decoupled context processing for context augmented language modeling}},
author={Li, Zonglin and Guo, Ruiqi and Kumar, Sanjiv},
journal={arXiv preprint arXiv:2210.05758},
year={2022}
}
@comment{Removed five entries that were byte-identical duplicates (same keys) of
entries already defined above: izacard2020leveraging, mialon2023augmented,
komeili2021internet, shuster2021retrieval, milbauer2023lait. Duplicate keys
trigger "repeated entry" errors in BibTeX/Biber.}
@misc{kim2023stack,
title={{Full Stack Optimization of Transformer Inference: a Survey}},
author={Sehoon Kim and Coleman Hooper and Thanakul Wattanawong and Minwoo Kang and Ruohan Yan and Hasan Genc and Grace Dinh and Qijing Huang and Kurt Keutzer and Michael W. Mahoney and Yakun Sophia Shao and Amir Gholami},
year={2023},
eprint={2302.14017},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{longchat2023,
title = {{How Long Can Open-Source LLMs Truly Promise on Context Length?}},
url = {https://lmsys.org/blog/2023-06-29-longchat},
author = {Li, Dacheng and Shao, Rulin and Xie, Anze and Sheng, Ying and Zheng, Lianmin and Gonzalez, Joseph E. and Stoica, Ion and Ma, Xuezhe and Zhang, Hao},
month = jun,
year = {2023}
}
@misc{sun2021longrange,
title={{Do Long-Range Language Models Actually Use Long-Range Context?}},
author={Simeng Sun and Kalpesh Krishna and Andrew Mattarella-Micke and Mohit Iyyer},
year={2021},
eprint={2109.09115},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{liu2023lost,
title={{Lost in the Middle: How Language Models Use Long Contexts}},
author={Nelson F. Liu and Kevin Lin and John Hewitt and Ashwin Paranjape and Michele Bevilacqua and Fabio Petroni and Percy Liang},
year={2023},
eprint={2307.03172},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{johnson2017billionscale,
title={{Billion-scale similarity search with GPUs}},
author={Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou},
year={2017},
eprint={1702.08734},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{2019t5,
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
journal = {arXiv e-prints},
year = {2019},
archivePrefix = {arXiv},
eprint = {1910.10683},
}
@inproceedings{
zhang2023ho,
title={{H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models}},
author={Zhenyu Zhang and Ying Sheng and Tianyi Zhou and Tianlong Chen and Lianmin Zheng and Ruisi Cai and Zhao Song and Yuandong Tian and Christopher Re and Clark Barrett and Zhangyang Wang and Beidi Chen},
booktitle={{Workshop on Efficient Systems for Foundation Models @ ICML2023}},
year={2023},
url={https://openreview.net/forum?id=ctPizehA9D}
}
@misc{dejong2023fido,
title={{FiDO: Fusion-in-Decoder optimized for stronger performance and faster inference}},
author={Michiel de Jong and Yury Zemlyanskiy and Joshua Ainslie and Nicholas FitzGerald and Sumit Sanghai and Fei Sha and William Cohen},
year={2023},
eprint={2212.08153},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{mohtashami2023landmark,
title={{Landmark Attention: Random-Access Infinite Context Length for Transformers}},
author={Amirkeivan Mohtashami and Martin Jaggi},
year={2023},
eprint={2305.16300},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{
borzunov2023distributed,
title={{Distributed Inference and Fine-tuning of Large Language Models Over The Internet}},
author={Alexander Borzunov and Dmitry Baranchuk and Tim Dettmers and Max Ryabinin and Younes Belkada and Artem Chumachenko and Pavel Samygin and Colin Raffel},
year={2023},
url={https://openreview.net/forum?id=HLQyRgRnoXo}
}
@misc{ding2023selfagreement,
title={{Self-Agreement: A Framework for Fine-tuning Language Models to Find Agreement among Diverse Opinions}},
author={Shiyao Ding and Takayuki Ito},
year={2023},
eprint={2305.11460},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{lin2023speciality,
title={{Speciality vs Generality: An Empirical Study on Catastrophic Forgetting in Fine-tuning Foundation Models}},
author={Lin, Yong and Tan, Lu and Lin, Hangyu and Zheng, Zeming and Pi, Renjie and Zhang, Jipeng and Diao, Shizhe and Wang, Haoxiang and Zhao, Han and Yao, Yuan and others},
journal={arXiv preprint arXiv:2309.06256},
year={2023}
}
@article{47761,
title = {Natural Questions: a Benchmark for Question Answering Research},
author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year = {2019},
journal = {Transactions of the Association for Computational Linguistics}
}
@misc{agostinelli2023musiclm,
title={{MusicLM: Generating Music From Text}},
author={Andrea Agostinelli and Timo I. Denk and Zal{\'a}n Borsos and Jesse Engel and Mauro Verzetti and Antoine Caillon and Qingqing Huang and Aren Jansen and Adam Roberts and Marco Tagliasacchi and Matt Sharifi and Neil Zeghidour and Christian Frank},
year={2023},
eprint={2301.11325},
archivePrefix={arXiv},
primaryClass={cs.SD}
}
@misc{chen2023extending,
title={{Extending Context Window of Large Language Models via Positional Interpolation}},
author={Shouyuan Chen and Sherman Wong and Liangjian Chen and Yuandong Tian},
year={2023},
eprint={2306.15595},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{kitaev2020reformer,
title={{Reformer: The Efficient Transformer}},
author={Nikita Kitaev and {\L}ukasz Kaiser and Anselm Levskaya},
year={2020},
eprint={2001.04451},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{luo2021stable,
title={{Stable, Fast and Accurate: Kernelized Attention with Relative Positional Encoding}},
author={Shengjie Luo and Shanda Li and Tianle Cai and Di He and Dinglan Peng and Shuxin Zheng and Guolin Ke and Liwei Wang and Tie-Yan Liu},
year={2021},
eprint={2106.12566},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{workshop2023bloom,
title={{BLOOM: A 176B-Parameter Open-Access Multilingual Language Model}},
author={BigScience Workshop and : and Teven Le Scao and Angela Fan and Christopher Akiki and Ellie Pavlick and Suzana Ilić and Daniel Hesslow and Roman Castagné and Alexandra Sasha Luccioni and François Yvon and Matthias Gallé and Jonathan Tow and Alexander M. Rush and Stella Biderman and Albert Webson and Pawan Sasanka Ammanamanchi and Thomas Wang and Benoît Sagot and Niklas Muennighoff and Albert Villanova del Moral and Olatunji Ruwase and Rachel Bawden and Stas Bekman and Angelina McMillan-Major and Iz Beltagy and Huu Nguyen and Lucile Saulnier and Samson Tan and Pedro Ortiz Suarez and Victor Sanh and Hugo Laurençon and Yacine Jernite and Julien Launay and Margaret Mitchell and Colin Raffel and Aaron Gokaslan and Adi Simhi and Aitor Soroa and Alham Fikri Aji and Amit Alfassy and Anna Rogers and Ariel Kreisberg Nitzav and Canwen Xu and Chenghao Mou and Chris Emezue and Christopher Klamm and Colin Leong and Daniel van Strien and David Ifeoluwa Adelani and Dragomir Radev and Eduardo González Ponferrada and Efrat Levkovizh and Ethan Kim and Eyal Bar Natan and Francesco De Toni and Gérard Dupont and Germán Kruszewski and Giada Pistilli and Hady Elsahar and Hamza Benyamina and Hieu Tran and Ian Yu and Idris Abdulmumin and Isaac Johnson and Itziar Gonzalez-Dios and Javier de la Rosa and Jenny Chim and Jesse Dodge and Jian Zhu and Jonathan Chang and Jörg Frohberg and Joseph Tobing and Joydeep Bhattacharjee and Khalid Almubarak and Kimbo Chen and Kyle Lo and Leandro Von Werra and Leon Weber and Long Phan and Loubna Ben allal and Ludovic Tanguy and Manan Dey and Manuel Romero Muñoz and Maraim Masoud and María Grandury and Mario Šaško and Max Huang and Maximin Coavoux and Mayank Singh and Mike Tian-Jian Jiang and Minh Chien Vu and Mohammad A. 
Jauhar and Mustafa Ghaleb and Nishant Subramani and Nora Kassner and Nurulaqilla Khamis and Olivier Nguyen and Omar Espejel and Ona de Gibert and Paulo Villegas and Peter Henderson and Pierre Colombo and Priscilla Amuok and Quentin Lhoest and Rheza Harliman and Rishi Bommasani and Roberto Luis López and Rui Ribeiro and Salomey Osei and Sampo Pyysalo and Sebastian Nagel and Shamik Bose and Shamsuddeen Hassan Muhammad and Shanya Sharma and Shayne Longpre and Somaieh Nikpoor and Stanislav Silberberg and Suhas Pai and Sydney Zink and Tiago Timponi Torrent and Timo Schick and Tristan Thrush and Valentin Danchev and Vassilina Nikoulina and Veronika Laippala and Violette Lepercq and Vrinda Prabhu and Zaid Alyafeai and Zeerak Talat and Arun Raja and Benjamin Heinzerling and Chenglei Si and Davut Emre Taşar and Elizabeth Salesky and Sabrina J. Mielke and Wilson Y. Lee and Abheesht Sharma and Andrea Santilli and Antoine Chaffin and Arnaud Stiegler and Debajyoti Datta and Eliza Szczechla and Gunjan Chhablani and Han Wang and Harshit Pandey and Hendrik Strobelt and Jason Alan Fries and Jos Rozen and Leo Gao and Lintang Sutawika and M Saiful Bari and Maged S. Al-shaibani and Matteo Manica and Nihal Nayak and Ryan Teehan and Samuel Albanie and Sheng Shen and Srulik Ben-David and Stephen H. 
Bach and Taewoon Kim and Tali Bers and Thibault Fevry and Trishala Neeraj and Urmish Thakker and Vikas Raunak and Xiangru Tang and Zheng-Xin Yong and Zhiqing Sun and Shaked Brody and Yallow Uri and Hadar Tojarieh and Adam Roberts and Hyung Won Chung and Jaesung Tae and Jason Phang and Ofir Press and Conglong Li and Deepak Narayanan and Hatim Bourfoune and Jared Casper and Jeff Rasley and Max Ryabinin and Mayank Mishra and Minjia Zhang and Mohammad Shoeybi and Myriam Peyrounette and Nicolas Patry and Nouamane Tazi and Omar Sanseviero and Patrick von Platen and Pierre Cornette and Pierre François Lavallée and Rémi Lacroix and Samyam Rajbhandari and Sanchit Gandhi and Shaden Smith and Stéphane Requena and Suraj Patil and Tim Dettmers and Ahmed Baruwa and Amanpreet Singh and Anastasia Cheveleva and Anne-Laure Ligozat and Arjun Subramonian and Aurélie Névéol and Charles Lovering and Dan Garrette and Deepak Tunuguntla and Ehud Reiter and Ekaterina Taktasheva and Ekaterina Voloshina and Eli Bogdanov and Genta Indra Winata and Hailey Schoelkopf and Jan-Christoph Kalo and Jekaterina Novikova and Jessica Zosa Forde and Jordan Clive and Jungo Kasai and Ken Kawamura and Liam Hazan and Marine Carpuat and Miruna Clinciu and Najoung Kim and Newton Cheng and Oleg Serikov and Omer Antverg and Oskar van der Wal and Rui Zhang and Ruochen Zhang and Sebastian Gehrmann and Shachar Mirkin and Shani Pais and Tatiana Shavrina and Thomas Scialom and Tian Yun and Tomasz Limisiewicz and Verena Rieser and Vitaly Protasov and Vladislav Mikhailov and Yada Pruksachatkun and Yonatan Belinkov and Zachary Bamberger and Zdeněk Kasner and Alice Rueda and Amanda Pestana and Amir Feizpour and Ammar Khan and Amy Faranak and Ana Santos and Anthony Hevia and Antigona Unldreaj and Arash Aghagol and Arezoo Abdollahi and Aycha Tammour and Azadeh HajiHosseini and Bahareh Behroozi and Benjamin Ajibade and Bharat Saxena and Carlos Muñoz Ferrandis and Daniel McDuff and Danish Contractor and David Lansky and Davis 
David and Douwe Kiela and Duong A. Nguyen and Edward Tan and Emi Baylor and Ezinwanne Ozoani and Fatima Mirza and Frankline Ononiwu and Habib Rezanejad and Hessie Jones and Indrani Bhattacharya and Irene Solaiman and Irina Sedenko and Isar Nejadgholi and Jesse Passmore and Josh Seltzer and Julio Bonis Sanz and Livia Dutra and Mairon Samagaio and Maraim Elbadri and Margot Mieskes and Marissa Gerchick and Martha Akinlolu and Michael McKenna and Mike Qiu and Muhammed Ghauri and Mykola Burynok and Nafis Abrar and Nazneen Rajani and Nour Elkott and Nour Fahmy and Olanrewaju Samuel and Ran An and Rasmus Kromann and Ryan Hao and Samira Alizadeh and Sarmad Shubber and Silas Wang and Sourav Roy and Sylvain Viguier and Thanh Le and Tobi Oyebade and Trieu Le and Yoyo Yang and Zach Nguyen and Abhinav Ramesh Kashyap and Alfredo Palasciano and Alison Callahan and Anima Shukla and Antonio Miranda-Escalada and Ayush Singh and Benjamin Beilharz and Bo Wang and Caio Brito and Chenxi Zhou and Chirag Jain and Chuxin Xu and Clémentine Fourrier and Daniel León Periñán and Daniel Molano and Dian Yu and Enrique Manjavacas and Fabio Barth and Florian Fuhrimann and Gabriel Altay and Giyaseddin Bayrak and Gully Burns and Helena U. 
Vrabec and Imane Bello and Ishani Dash and Jihyun Kang and John Giorgi and Jonas Golde and Jose David Posada and Karthik Rangasai Sivaraman and Lokesh Bulchandani and Lu Liu and Luisa Shinzato and Madeleine Hahn de Bykhovetz and Maiko Takeuchi and Marc Pàmies and Maria A Castillo and Marianna Nezhurina and Mario Sänger and Matthias Samwald and Michael Cullan and Michael Weinberg and Michiel De Wolf and Mina Mihaljcic and Minna Liu and Moritz Freidank and Myungsun Kang and Natasha Seelam and Nathan Dahlberg and Nicholas Michio Broad and Nikolaus Muellner and Pascale Fung and Patrick Haller and Ramya Chandrasekhar and Renata Eisenberg and Robert Martin and Rodrigo Canalli and Rosaline Su and Ruisi Su and Samuel Cahyawijaya and Samuele Garda and Shlok S Deshmukh and Shubhanshu Mishra and Sid Kiblawi and Simon Ott and Sinee Sang-aroonsiri and Srishti Kumar and Stefan Schweter and Sushil Bharati and Tanmay Laud and Théo Gigant and Tomoya Kainuma and Wojciech Kusa and Yanis Labrak and Yash Shailesh Bajaj and Yash Venkatraman and Yifan Xu and Yingxin Xu and Yu Xu and Zhe Tan and Zhongli Xie and Zifan Ye and Mathilde Bras and Younes Belkada and Thomas Wolf},
year={2023},
eprint={2211.05100},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{raffel2023exploring,
title={{Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}},
author={Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
year={2023},
eprint={1910.10683},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{touvron2023llama,
title={{LLaMA: Open and Efficient Foundation Language Models}},
author={Hugo Touvron and Thibaut Lavril and Gautier Izacard and Xavier Martinet and Marie-Anne Lachaux and Timoth{\'e}e Lacroix and Baptiste Rozi{\`e}re and Naman Goyal and Eric Hambro and Faisal Azhar and Aurelien Rodriguez and Armand Joulin and Edouard Grave and Guillaume Lample},
year={2023},
eprint={2302.13971},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{zhang2022opt,
title={{OPT: Open Pre-trained Transformer Language Models}},
author={Susan Zhang and Stephen Roller and Naman Goyal and Mikel Artetxe and Moya Chen and Shuohui Chen and Christopher Dewan and Mona Diab and Xian Li and Xi Victoria Lin and Todor Mihaylov and Myle Ott and Sam Shleifer and Kurt Shuster and Daniel Simig and Punit Singh Koura and Anjali Sridhar and Tianlu Wang and Luke Zettlemoyer},
year={2022},
eprint={2205.01068},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{zhai2022scaling,
title={{Scaling vision transformers}},
author={Zhai, Xiaohua and Kolesnikov, Alexander and Houlsby, Neil and Beyer, Lucas},
booktitle={{Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}},
pages={12104--12113},
year={2022}
}
@article{kasneci2023chatgpt,
title={{ChatGPT for good? On opportunities and challenges of large language models for education}},
author={Kasneci, Enkelejda and Se{\ss}ler, Kathrin and K{\"u}chemann, Stefan and Bannert, Maria and Dementieva, Daryna and Fischer, Frank and Gasser, Urs and Groh, Georg and G{\"u}nnemann, Stephan and H{\"u}llermeier, Eyke and others},
journal={Learning and individual differences},
volume={103},
pages={102274},
year={2023},
publisher={Elsevier}
}
@misc{google_bard,
title = {Bard - Chat Based AI Tool from Google, Powered by PaLM 2},
author = {{Google}},
howpublished = {\url{https://bard.google.com/}},
note = {(Accessed on 09/21/2023)}
}
@misc{microsoft_bing,
title = {Bing},
author = {{Microsoft Corporation}},
year = {2009},
howpublished = {\url{https://www.bing.com/}}
}
@misc{bisk2019piqa,
title={{PIQA: Reasoning about Physical Commonsense in Natural Language}},
author={Yonatan Bisk and Rowan Zellers and Ronan Le Bras and Jianfeng Gao and Yejin Choi},
year={2019},
eprint={1911.11641},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{adiwardana2020humanlike,
title={{Towards a Human-like Open-Domain Chatbot}},
author={Daniel Adiwardana and Minh-Thang Luong and David R. So and Jamie Hall and Noah Fiedel and Romal Thoppilan and Zi Yang and Apoorv Kulshreshtha and Gaurav Nemade and Yifeng Lu and Quoc V. Le},
year={2020},
eprint={2001.09977},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{openai2023gpt4,
title={{GPT-4 Technical Report}},
author={OpenAI},
year={2023},
eprint={2303.08774},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{wu2023fast,
title={{Fast Distributed Inference Serving for Large Language Models}},
author={Bingyang Wu and Yinmin Zhong and Zili Zhang and Gang Huang and Xuanzhe Liu and Xin Jin},
year={2023},
eprint={2305.05920},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{pope2022efficiently,
title={{Efficiently Scaling Transformer Inference}},
author={Reiner Pope and Sholto Douglas and Aakanksha Chowdhery and Jacob Devlin and James Bradbury and Anselm Levskaya and Jonathan Heek and Kefan Xiao and Shivani Agrawal and Jeff Dean},
year={2022},
eprint={2211.05102},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{dao2022flashattention,
title={{FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness}},
author={Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher R{\'e}},
year={2022},
eprint={2205.14135},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{narayanan2023cheaply,
title={{Cheaply Evaluating Inference Efficiency Metrics for Autoregressive Transformer APIs}},
author={Deepak Narayanan and Keshav Santhanam and Peter Henderson and Rishi Bommasani and Tony Lee and Percy Liang},
year={2023},
eprint={2305.02440},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{FasterTransformer,
author = {{NVIDIA}},
title = {FasterTransformer},
year = {2019},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/NVIDIA/FasterTransformer.git}},
commit = {c6e8f60}
}
@misc{anagnostidis2023dynamic,
title={{Dynamic Context Pruning for Efficient and Interpretable Autoregressive Transformers}},
author={Sotiris Anagnostidis and Dario Pavllo and Luca Biggio and Lorenzo Noci and Aurelien Lucchi and Thomas Hofmann},
year={2023},
eprint={2305.15805},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{shepherd,
title={{SHEPHERD}: Serving {DNNs} in the Wild},
author={Zhang, Hong and Tang, Yupeng and Khandelwal, Anurag and Stoica, Ion},
booktitle={{20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)}},
pages={787--808},
year={2023}
}
@inproceedings{clockwork,
title={Serving {DNNs} like Clockwork: Performance Predictability from the Bottom Up},
author={Gujarati, Arpan and Karimi, Reza and Alzayat, Safya and Hao, Wei and Kaufmann, Antoine and Vigfusson, Ymir and Mace, Jonathan},
booktitle={{14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)}},
pages={443--462},
year={2020}
}
@inproceedings{orca,
title={Orca: A Distributed Serving System for {Transformer-Based} Generative Models},
author={Yu, Gyeong-In and Jeong, Joo Seong and Kim, Geon-Woo and Kim, Soojeong and Chun, Byung-Gon},
booktitle={{16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)}},
pages={521--538},
year={2022}
}
@article{efficiently-scaling,
title={{Efficiently scaling transformer inference}},
author={Pope, Reiner and Douglas, Sholto and Chowdhery, Aakanksha and Devlin, Jacob and Bradbury, James and Heek, Jonathan and Xiao, Kefan and Agrawal, Shivani and Dean, Jeff},
journal={Proceedings of Machine Learning and Systems},
volume={5},
year={2023}
}
@misc{llm-app-1,
title = {Applications of Large Language Models - InData Labs},
howpublished = {\url{https://indatalabs.com/blog/large-language-model-apps}},
note = {(Accessed on 09/21/2023)}
}
@misc{llm-app-2,
title = {12 Practical Large Language Model (LLM) Applications - Techopedia},
howpublished = {\url{https://www.techopedia.com/12-practical-large-language-model-llm-applications}},
note = {(Accessed on 09/21/2023)}
}
@misc{llm-app-3,
title = {7 Top Large Language Model Use Cases And Applications},
howpublished = {\url{https://www.projectpro.io/article/large-language-model-use-cases-and-applications/887}},
note = {(Accessed on 09/21/2023)}
}
@misc{llm-app-4,
title = {Real-World Use Cases for Large Language Models (LLMs) | by CellStrat | Medium},
howpublished = {\url{https://cellstrat.medium.com/real-world-use-cases-for-large-language-models-llms-d71c3a577bf2}},
note = {(Accessed on 09/21/2023)}
}
@misc{gpt4-api,
title = {GPT-4 API general availability and deprecation of older models in the Completions API},
howpublished = {\url{https://openai.com/blog/gpt-4-api-general-availability}},
note = {(Accessed on 09/21/2023)}
}
@misc{langchain,
title = {langchain-ai/langchain: Building applications with LLMs through composability},
howpublished = {\url{https://github.com/langchain-ai/langchain}},
note = {(Accessed on 09/21/2023)}
}
@comment{The "llama" entry below cites the same work as touvron2023llama above;
it is kept so existing \cite{llama} commands still resolve.}
@misc{llama,
title = {{LLaMA: Open and Efficient Foundation Language Models}},
howpublished = {\url{https://arxiv.org/abs/2302.13971}},
note = {(Accessed on 09/21/2023)}
}
@misc{long-context,
title = {Anthropic \textbackslash{} Introducing 100K Context Windows},
howpublished = {\url{https://www.anthropic.com/index/100k-context-windows}},
note = {(Accessed on 09/21/2023)}
}
@article{arxiv-1,
title={{Lost in the middle: How language models use long contexts}},
author={Liu, Nelson F and Lin, Kevin and Hewitt, John and Paranjape, Ashwin and Bevilacqua, Michele and Petroni, Fabio and Liang, Percy},
journal={arXiv preprint arXiv:2307.03172},
year={2023}
}
@article{arxiv-2,
title={{Do long-range language models actually use long-range context?}},
author={Sun, Simeng and Krishna, Kalpesh and Mattarella-Micke, Andrew and Iyyer, Mohit},
journal={arXiv preprint arXiv:2109.09115},
year={2021}
}
@misc{langchain-retrival,
title = {Store and reference chat history | Langchain},
howpublished = {\url{https://python.langchain.com/docs/use_cases/question_answering/how_to/chat_vector_db}},
note = {(Accessed on 09/21/2023)}
}
@misc{arxiv-information-retrieval,
title = {{Improving Language Models by Retrieving from Trillions of Tokens}},
howpublished = {\url{https://arxiv.org/pdf/2112.04426.pdf}},
note = {(Accessed on 09/21/2023)}
}
@misc{generative-agents,
author = {},
title = {[2304.03442] Generative Agents: Interactive Simulacra of Human Behavior},
howpublished = {\url{https://arxiv.org/abs/2304.03442}},
month = {},