@Article{ karl_hierarchical_models,
author = {K.J. Friston},
title = {Hierarchical Models in the Brain},
journal = {PLOS Computational Biology},
year = {2008},
volume = {4},
number = {11},
pages = {e1000211},
doi = {10.1371/journal.pcbi.1000211},
pdf = {/spm/doc/papers/Hierarchical_Models_in_the_Brain.pdf},
keyword = {DEM}
}
@inproceedings{1991-moody,
title={Note on generalization, regularization and architecture selection in nonlinear learning systems},
author={Moody, John E},
booktitle={Neural Networks for Signal Processing Proceedings of the 1991 IEEE Workshop},
pages={1--10},
year={1991},
organization={IEEE}
}
@inproceedings{1994-hansen,
title={Controlled growth of cascade correlation nets},
author={Hansen, Lars Kai and others},
booktitle={International Conference on Artificial Neural Networks},
pages={797--800},
year={1994},
organization={Springer}
}
@INPROCEEDINGS{2020-qin,
author={E. {Qin} and A. {Samajdar} and H. {Kwon} and V. {Nadella} and S. {Srinivasan} and D. {Das} and B. {Kaul} and T. {Krishna}},
booktitle={2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
title={SIGMA: A Sparse and Irregular GEMM Accelerator with Flexible Interconnects for DNN Training},
year={2020},
volume={},
number={},
pages={58-70},
doi={10.1109/HPCA47549.2020.00015}}
@article{lillicrap2020backpropagation,
title={Backpropagation and the brain},
author={Lillicrap, Timothy P and Santoro, Adam and Marris, Luke and Akerman, Colin J and Hinton, Geoffrey},
journal={Nature Reviews Neuroscience},
pages={1--12},
year={2020},
publisher={Nature Publishing Group}
}
@inproceedings{2019-jin,
author = {Jin, Sian and Di, Sheng and Liang, Xin and Tian, Jiannan and Tao, Dingwen and Cappello, Franck},
title = {DeepSZ: A Novel Framework to Compress Deep Neural Networks by Using Error-Bounded Lossy Compression},
year = {2019},
isbn = {9781450366700},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3307681.3326608},
doi = {10.1145/3307681.3326608},
abstract = {Today's deep neural networks (DNNs) are becoming deeper and wider because of increasing demand on the analysis quality and more and more complex applications to resolve. The wide and deep DNNs, however, require large amounts of resources (such as memory, storage, and I/O), significantly restricting their utilization on resource-constrained platforms. Although some DNN simplification methods (such as weight quantization) have been proposed to address this issue, they suffer from either low compression ratios or high compression errors, which may introduce an expensive fine-tuning overhead (i.e., a costly retraining process for the target inference accuracy). In this paper, we propose DeepSZ: an accuracy-loss expected neural network compression framework, which involves four key steps: network pruning, error bound assessment, optimization for error bound configuration, and compressed model generation, featuring a high compression ratio and low encoding time. The contribution is threefold. (1)We develop an adaptive approach to select the feasible error bounds for each layer. (2) We build a model to estimate the overall loss of inference accuracy based on the inference accuracy degradation caused by individual decompressed layers. (3) We develop an efficient optimization algorithm to determine the best-fit configuration of error bounds in order to maximize the compression ratio under the user-set inference accuracy constraint. Experiments show that DeepSZ can compress AlexNet and VGG-16 on the ImageNet dataset by a compression ratio of 46\texttimes{} and 116\texttimes{}, respectively, and compress LeNet-300-100 and LeNet-5 on the MNIST dataset by a compression ratio of 57\texttimes{} and 56\texttimes{}, respectively, with only up to 0.3% loss of inference accuracy. Compared with other state-of-the-art methods, DeepSZ can improve the compression ratio by up to 1.43\texttimes{}, the DNN encoding performance by up to 4.0\texttimes{} with four V100 GPUs, and the decoding performance by up to 6.2\texttimes{}.},
booktitle = {Proceedings of the 28th International Symposium on High-Performance Parallel and Distributed Computing},
pages = {159–170},
numpages = {12},
keywords = {performance, neural networks, lossy compression, deep learning},
location = {Phoenix, AZ, USA},
series = {HPDC '19}
}
@inproceedings{2019-zhang-snap,
title={SNAP: A 1.67--21.55TOPS/W Sparse Neural Acceleration Processor for Unstructured Sparse Deep Neural Network Inference in 16nm CMOS},
author={Jie-Fang Zhang and Ching-En Lee and C. Liu and Y. Shao and Stephen W. Keckler and Zhengya Zhang},
booktitle={2019 Symposium on VLSI Circuits},
year={2019},
pages={C306-C307}
}
@INPROCEEDINGS{2012-yu,
author={D. {Yu} and F. {Seide} and G. {Li} and L. {Deng}},
booktitle={2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Exploiting sparseness in deep neural networks for large vocabulary speech recognition},
year={2012},
volume={},
number={},
pages={4409-4412},
doi={10.1109/ICASSP.2012.6288897}}
@inproceedings{10.5555/2986916.2987033,
author = {Krogh, Anders and Hertz, John A.},
title = {A Simple Weight Decay Can Improve Generalization},
year = {1991},
isbn = {1558602224},
publisher = {Morgan Kaufmann Publishers Inc.},
address = {San Francisco, CA, USA},
abstract = {It has been observed in numerical simulations that a weight decay can improve generalization in a feed-forward neural network. This paper explains why. It is proven that a weight decay has two effects in a linear network. First, it suppresses any irrelevant components of the weight vector by choosing the smallest vector that solves the learning problem. Second, if the size is chosen right, a weight decay can suppress some of the effects of static noise on the targets, which improves generalization quite a lot. It is then shown how to extend these results to networks with hidden layers and non-linear units. Finally the theory is confirmed by some numerical simulations using the data from NetTalk.},
booktitle = {Proceedings of the 4th International Conference on Neural Information Processing Systems},
pages = {950–957},
numpages = {8},
location = {Denver, Colorado},
series = {NIPS'91}
}
@misc{2019-niu,
title={SPEC2: SPECtral SParsE CNN Accelerator on FPGAs},
author={Yue Niu and Hanqing Zeng and Ajitesh Srivastava and Kartik Lakhotia and Rajgopal Kannan and Yanzhi Wang and Viktor Prasanna},
year={2019},
eprint={1910.11103},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{rasmussen2001occam,
title={Occam's razor},
author={Rasmussen, Carl Edward and Ghahramani, Zoubin},
booktitle={Advances in neural information processing systems},
pages={294--300},
year={2001}
}
@misc{2018-zhu,
title={SparseNN: An Energy-Efficient Neural Network Accelerator Exploiting Input and Output Sparsity},
author={Jingyang Zhu and Jingbo Jiang and Xizi Chen and Chi-Ying Tsui},
year={2017},
eprint={1711.01263},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{issr,
title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
author={Paul Scheffler and Florian Zaruba and Fabian Schuiki and Torsten Hoefler and Luca Benini},
year={2020},
eprint={2011.08070},
archivePrefix={arXiv},
primaryClass={cs.AR}
}
@inproceedings{2020-hegde,
author = {Hegde, Kartik and Asghari-Moghaddam, Hadi and Pellauer, Michael and Crago, Neal and Jaleel, Aamer and Solomonik, Edgar and Emer, Joel and Fletcher, Christopher W.},
title = {ExTensor: An Accelerator for Sparse Tensor Algebra},
year = {2019},
isbn = {9781450369381},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3352460.3358275},
doi = {10.1145/3352460.3358275},
abstract = {Generalized tensor algebra is a prime candidate for acceleration via customized ASICs. Modern tensors feature a wide range of data sparsity, with the density of non-zero elements ranging from $10^{-6}$% to 50%. This paper proposes a novel approach to accelerate tensor kernels based on the principle of hierarchical elimination of computation in the presence of sparsity. This approach relies on rapidly finding intersections---situations where both operands of a multiplication are non-zero---enabling new data fetching mechanisms and avoiding memory latency overheads associated with sparse kernels implemented in software. We propose the ExTensor accelerator, which builds these novel ideas on handling sparsity into hardware to enable better bandwidth utilization and compute throughput. We evaluate ExTensor on several kernels relative to industry libraries (Intel MKL) and state-of-the-art tensor algebra compilers (TACO). When bandwidth normalized, we demonstrate an average speedup of 3.4\texttimes{}, 1.3\texttimes{}, 2.8\texttimes{}, 24.9\texttimes{}, and 2.7\texttimes{} on SpMSpM, SpMM, TTV, TTM, and SDDMM kernels respectively over a server class CPU.},
booktitle = {Proceedings of the 52nd Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {319–333},
numpages = {15},
keywords = {Tensor Algebra, Sparse Computation, Hardware Acceleration},
location = {Columbus, OH, USA},
series = {MICRO '52}
}
@INPROCEEDINGS{2017-hill,
author={P. {Hill} and A. {Jain} and M. {Hill} and B. {Zamirai} and C. {Hsu} and M. A. {Laurenzano} and S. {Mahlke} and L. {Tang} and J. {Mars}},
booktitle={2017 50th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
title={DeftNN: Addressing Bottlenecks for DNN Execution on GPUs via Synapse Vector Elimination and Near-compute Data Fission},
year={2017},
volume={},
number={},
pages={786-799},
doi={}}
@ARTICLE{2017-kim,
author={D. {Kim} and J. {Ahn} and S. {Yoo}},
journal={IEEE Design \& Test},
title={ZeNA: Zero-Aware Neural Network Accelerator},
year={2018},
volume={35},
number={1},
pages={39-46},
doi={10.1109/MDAT.2017.2741463}}
@ARTICLE{2019-li,
author={J. {Li} and S. {Jiang} and S. {Gong} and J. {Wu} and J. {Yan} and G. {Yan} and X. {Li}},
journal={IEEE Transactions on Computers},
title={SqueezeFlow: A Sparse CNN Accelerator Exploiting Concise Convolution Rules},
year={2019},
volume={68},
number={11},
pages={1663-1677},
doi={10.1109/TC.2019.2924215}}
@inproceedings{2019-zhang-eager,
author = {Zhang, Jiaqi and Chen, Xiangru and Song, Mingcong and Li, Tao},
title = {Eager Pruning: Algorithm and Architecture Support for Fast Training of Deep Neural Networks},
year = {2019},
isbn = {9781450366694},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3307650.3322263},
doi = {10.1145/3307650.3322263},
abstract = {Today's big and fast data and the changing circumstance require fast training of Deep Neural Networks (DNN) in various applications. However, training a DNN with tons of parameters involves intensive computation. Enlightened by the fact that redundancy exists in DNNs and the observation that the ranking of the significance of the weights changes slightly during training, we propose Eager Pruning, which speeds up DNN training by moving pruning to an early stage.Eager Pruning is supported by an algorithm and architecture co-design. The proposed algorithm dictates the architecture to identify and prune insignificant weights during training without accuracy loss. A novel architecture is designed to transform the reduced training computation into performance improvement. Our proposed Eager Pruning system gains an average of 1.91x speedup over state-of-the-art hardware accelerator and 6.31x energy-efficiency over Nvidia GPUs.},
booktitle = {Proceedings of the 46th International Symposium on Computer Architecture},
pages = {292–303},
numpages = {12},
keywords = {neural network training, neural network pruning, software-hardware co-design},
location = {Phoenix, Arizona},
series = {ISCA '19}
}
@misc{2018-kung,
title={Packing Sparse Convolutional Neural Networks for Efficient Systolic Array Implementations: Column Combining Under Joint Optimization},
author={H. T. Kung and Bradley McDanel and Sai Qian Zhang},
year={2018},
eprint={1811.04770},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@INPROCEEDINGS{2016-albericio,
author={J. {Albericio} and P. {Judd} and T. {Hetherington} and T. {Aamodt} and N. E. {Jerger} and A. {Moshovos}},
booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
title={Cnvlutin: Ineffectual-Neuron-Free Deep Neural Network Computing},
year={2016},
volume={},
number={},
pages={1-13},
doi={10.1109/ISCA.2016.11}}
@misc{a100,
title={NVIDIA A100 Tensor Core GPU Architecture},
author={{Nvidia}},
year={2020},
}
@book{UsingAdvancedMPI,
author={William Gropp and Torsten Hoefler and Rajeev Thakur and E. Lusk},
title={{Using Advanced MPI: Modern Features of the Message-Passing Interface}},
year={2014},
month={Nov.},
location={Cambridge, MA},
publisher={MIT Press},
isbn={978-0262527637},
source={http://www.unixer.de/~htor/publications/},
}
@inproceedings{gropp-datatype-performance,
author={William Gropp and Torsten Hoefler and Rajeev Thakur and Jesper Larsson Träff},
title={{Performance Expectations and Guidelines for MPI Derived Datatypes}},
year={2011},
month={Sep.},
pages={150-159},
volume={6960},
booktitle={Recent Advances in the Message Passing Interface (EuroMPI'11)},
location={Santorini, Greece},
publisher={Springer},
isbn={978-3-642-24448-3},
source={http://www.unixer.de/~htor/publications/},
}
@misc{2020-zhang,
title={SpArch: Efficient Architecture for Sparse Matrix Multiplication},
author={Zhekai Zhang and Hanrui Wang and Song Han and William J. Dally},
year={2020},
eprint={2002.08947},
archivePrefix={arXiv},
primaryClass={cs.AR}
}
@misc{2016-park,
title={Faster CNNs with Direct Sparse Convolutions and Guided Pruning},
author={Jongsoo Park and Sheng Li and Wei Wen and Ping Tak Peter Tang and Hai Li and Yiran Chen and Pradeep Dubey},
year={2017},
eprint={1608.01409},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{1996-olshausen,
title={Emergence of simple-cell receptive field properties by learning a sparse code for natural images},
author={Olshausen, Bruno A and Field, David J},
journal={Nature},
volume={381},
number={6583},
pages={607--609},
year={1996},
publisher={Nature Publishing Group}
}
@article{1989-janowsky,
title={Pruning versus clipping in neural networks},
author={Janowsky, Steven A},
journal={Physical Review A},
volume={39},
number={12},
pages={6600},
year={1989},
publisher={APS}
}
@INPROCEEDINGS{7780804,
author={A. {Lavin} and S. {Gray}},
booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
title={Fast Algorithms for Convolutional Neural Networks},
year={2016},
volume={},
number={},
pages={4013-4021},
doi={10.1109/CVPR.2016.435}}
@inproceedings{2020-gondimalla,
author = {Gondimalla, Ashish and Chesnut, Noah and Thottethodi, Mithuna and Vijaykumar, T. N.},
title = {SparTen: A Sparse Tensor Accelerator for Convolutional Neural Networks},
year = {2019},
isbn = {9781450369381},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3352460.3358291},
doi = {10.1145/3352460.3358291},
abstract = {Convolutional neural networks (CNNs) are emerging as powerful tools for image processing. Recent machine learning work has reduced CNNs' compute and data volumes by exploiting the naturally-occurring and actively-transformed zeros in the feature maps and filters. While previous semi-sparse architectures exploit one-sided sparsity either in the feature maps or the filters, but not both, a recent fully-sparse architecture, called Sparse CNN (SCNN), exploits two-sided sparsity to improve performance and energy over dense architectures. However, sparse vector-vector dot product, a key primitive in sparse CNNs, would be inefficient using the representation adopted by SCNN. The dot product requires finding and accessing non-zero elements in matching positions in the two sparse vectors -- an inner join using the position as the key with a single value field. SCNN avoids the inner join by performing a Cartesian product capturing the relevant multiplications. However, SCNN's approach incurs several considerable overheads and is not applicable to non-unit-stride convolutions. Further, exploiting reuse in sparse CNNs fundamentally causes systematic load imbalance not addressed by SCNN. We propose SparTen which achieves efficient inner join by providing support for native two-sided sparse execution and memory storage. To tackle load imbalance, SparTen employs a software scheme, called greedy balancing, which groups filters by density via two variants, a software-only one which uses whole-filter density and a software-hardware hybrid which uses finer-grain density. Our simulations show that, on average, SparTen performs 4.7x, 1.8x, and 3x better than a dense architecture, one-sided sparse architecture, and SCNN, respectively. An FPGA implementation shows that SparTen performs 4.3x and 1.9x better than a dense architecture and a one-sided sparse architecture, respectively.},
booktitle = {Proceedings of the 52nd Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {151–165},
numpages = {15},
keywords = {Convolutional neural networks, Accelerators, Sparse tensors},
location = {Columbus, OH, USA},
series = {MICRO '52}
}
@misc{2020-yang,
title={Procrustes: a Dataflow and Accelerator for Sparse Deep Neural Network Training},
author={Dingqing Yang and Amin Ghasemazar and Xiaowei Ren and Maximilian Golub and Guy Lemieux and Mieszko Lis},
year={2020},
eprint={2009.10976},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@INPROCEEDINGS{2018-zhou,
author={X. {Zhou} and Z. {Du} and Q. {Guo} and S. {Liu} and C. {Liu} and C. {Wang} and X. {Zhou} and L. {Li} and T. {Chen} and Y. {Chen}},
booktitle={2018 51st Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
title={Cambricon-S: Addressing Irregularity in Sparse Neural Networks through A Cooperative Software/Hardware Approach},
year={2018},
volume={},
number={},
pages={15-28},
doi={10.1109/MICRO.2018.00011}}
@INPROCEEDINGS{2016-zhu,
author={ Jingyang Zhu and Zhiliang Qian and Chi-Ying Tsui},
booktitle={2016 21st Asia and South Pacific Design Automation Conference (ASP-DAC)},
title={LRADNN: High-throughput and energy-efficient Deep Neural Network accelerator using Low Rank Approximation},
year={2016},
volume={},
number={},
pages={581-586},
doi={10.1109/ASPDAC.2016.7428074}}
@INPROCEEDINGS{2016-zhang,
author={S. {Zhang} and Z. {Du} and L. {Zhang} and H. {Lan} and S. {Liu} and L. {Li} and Q. {Guo} and T. {Chen} and Y. {Chen}},
booktitle={2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
title={Cambricon-X: An accelerator for sparse neural networks},
year={2016},
volume={},
number={},
pages={1-12},
doi={10.1109/MICRO.2016.7783723}}
@inproceedings{2020-niu,
author = {Niu, Yue and Kannan, Rajgopal and Srivastava, Ajitesh and Prasanna, Viktor},
title = {Reuse Kernels or Activations? A Flexible Dataflow for Low-Latency Spectral CNN Acceleration},
year = {2020},
isbn = {9781450370998},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3373087.3375302},
doi = {10.1145/3373087.3375302},
abstract = {Spectral-domain CNNs have been shown to be more efficient than traditional spatial CNNs in terms of reducing computation complexity. However they come with a 'kernel explosion' problem that, even after compression (pruning), imposes a high memory burden and off-chip bandwidth requirement for kernel access. This creates a performance gap between the potential acceleration offered by compression and actual FPGA implementation performance, especially for low-latency CNN inference. In this paper, we develop a principled approach to overcoming this performance gap and designing a low-latency, low-bandwidth, spectral sparse CNN accelerator on FPGAs. First, we analyze the bandwidth-storage tradeoff of sparse convolutional layers and locate communication bottlenecks. We then develop a dataflow for flexibly optimizing data reuse in different layers to minimize off-chip communication. Finally, we propose a novel scheduling algorithm to optimally schedule the on-chip memory access of multiple sparse kernels and minimize read conflicts. On a state-of-the-art FPGA platform, our design reduces data transfers by 42% with DSP utilization up to 90% and achieves inference latency of 9 ms for VGG16, compared to the baseline state-of-the-art latency of 68 ms.},
booktitle = {Proceedings of the 2020 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
pages = {266–276},
numpages = {11},
keywords = {sparse operation, flexible dataflow, spectral cnns, accelerator},
location = {Seaside, CA, USA},
series = {FPGA '20}
}
@misc{2019-golub,
title={Full deep neural network training on a pruned weight budget},
author={Maximilian Golub and Guy Lemieux and Mieszko Lis},
year={2019},
eprint={1806.06949},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2019-gupta,
title={MASR: A Modular Accelerator for Sparse RNNs},
author={Udit Gupta and Brandon Reagen and Lillian Pentecost and Marco Donato and Thierry Tambe and Alexander M. Rush and Gu-Yeon Wei and David Brooks},
year={2019},
eprint={1908.08976},
archivePrefix={arXiv},
primaryClass={eess.SP}
}
@misc{2017-mao,
title={Exploring the Regularity of Sparse Structure in Convolutional Neural Networks},
author={Huizi Mao and Song Han and Jeff Pool and Wenshuo Li and Xingyu Liu and Yu Wang and William J. Dally},
year={2017},
eprint={1705.08922},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@ARTICLE{2016-chen,
author={Y. {Chen} and T. {Krishna} and J. S. {Emer} and V. {Sze}},
journal={IEEE Journal of Solid-State Circuits},
title={Eyeriss: An Energy-Efficient Reconfigurable Accelerator for Deep Convolutional Neural Networks},
year={2017},
volume={52},
number={1},
pages={127-138},
doi={10.1109/JSSC.2016.2616357}}
@misc{noh2015learning,
title={Learning Deconvolution Network for Semantic Segmentation},
author={Hyeonwoo Noh and Seunghoon Hong and Bohyung Han},
year={2015},
eprint={1505.04366},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2019-chen,
title={Eyeriss v2: A Flexible Accelerator for Emerging Deep Neural Networks on Mobile Devices},
author={Yu-Hsin Chen and Tien-Ju Yang and Joel Emer and Vivienne Sze},
year={2019},
eprint={1807.07928},
archivePrefix={arXiv},
primaryClass={cs.DC}
}
@misc{goodfellow2014generative,
title={Generative Adversarial Networks},
author={Ian J. Goodfellow and Jean Pouget-Abadie and Mehdi Mirza and Bing Xu and David Warde-Farley and Sherjil Ozair and Aaron Courville and Yoshua Bengio},
year={2014},
eprint={1406.2661},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@INPROCEEDINGS{2016-reagen,
author={B. {Reagen} and P. {Whatmough} and R. {Adolf} and S. {Rama} and H. {Lee} and S. K. {Lee} and J. M. {Hernández-Lobato} and G. {Wei} and D. {Brooks}},
booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
title={Minerva: Enabling Low-Power, Highly-Accurate Deep Neural Network Accelerators},
year={2016},
volume={},
number={},
pages={267-278},
doi={10.1109/ISCA.2016.32}}
@inproceedings{2018-rhu,
title={Compressing DMA engine: Leveraging activation sparsity for training deep neural networks},
author={Rhu, Minsoo and O'Connor, Mike and Chatterjee, Niladrish and Pool, Jeff and Kwon, Youngeun and Keckler, Stephen W},
booktitle={2018 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
pages={78--91},
year={2018},
organization={IEEE}
}
@misc{2017-parashar,
title={SCNN: An Accelerator for Compressed-sparse Convolutional Neural Networks},
author={Angshuman Parashar and Minsoo Rhu and Anurag Mukkara and Antonio Puglielli and Rangharajan Venkatesan and Brucek Khailany and Joel Emer and Stephen W. Keckler and William J. Dally},
year={2017},
eprint={1708.04485},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@misc{dave2020hardware,
title={Hardware Acceleration of Sparse and Irregular Tensor Computations of ML Models: A Survey and Insights},
author={Shail Dave and Riyadh Baghdadi and Tony Nowatzki and Sasikanth Avancha and Aviral Shrivastava and Baoxin Li},
year={2020},
eprint={2007.00864},
archivePrefix={arXiv},
primaryClass={cs.AR}
}
@article{2014-collins,
author = {Maxwell D. Collins and
Pushmeet Kohli},
title = {Memory Bounded Deep Convolutional Networks},
journal = {CoRR},
volume = {abs/1412.1442},
year = {2014},
url = {http://arxiv.org/abs/1412.1442},
archivePrefix = {arXiv},
eprint = {1412.1442},
timestamp = {Mon, 13 Aug 2018 16:47:16 +0200},
biburl = {https://dblp.org/rec/journals/corr/CollinsK14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{2016-han-ese,
title={ESE: Efficient Speech Recognition Engine with Sparse LSTM on FPGA},
author={Song Han and Junlong Kang and Huizi Mao and Yiming Hu and Xin Li and Yubin Li and Dongliang Xie and Hong Luo and Song Yao and Yu Wang and Huazhong Yang and William J. Dally},
year={2017},
eprint={1612.00694},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{2016-see,
title={Compression of Neural Machine Translation Models via Pruning},
author={Abigail See and Minh-Thang Luong and Christopher D. Manning},
year={2016},
eprint={1606.09274},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
@misc{2016-wen,
title={Learning Structured Sparsity in Deep Neural Networks},
author={Wei Wen and Chunpeng Wu and Yandan Wang and Yiran Chen and Hai Li},
year={2016},
eprint={1608.03665},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@misc{mdl,
title={A tutorial introduction to the minimum description length principle},
author={Grunwald, Peter},
year={2004},
eprint={math/0406077},
archivePrefix={arXiv}
}
@misc{denil2014predicting,
title={Predicting Parameters in Deep Learning},
author={Misha Denil and Babak Shakibi and Laurent Dinh and Marc'Aurelio Ranzato and Nando de Freitas},
year={2014},
eprint={1306.0543},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2020-savarese,
title={Winning the Lottery with Continuous Sparsification},
author={Pedro Savarese and Hugo Silva and Michael Maire},
year={2020},
eprint={1912.04427},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{neyshabur2018understanding,
title={Towards Understanding the Role of Over-Parametrization in Generalization of Neural Networks},
author={Behnam Neyshabur and Zhiyuan Li and Srinadh Bhojanapalli and Yann LeCun and Nathan Srebro},
year={2018},
eprint={1805.12076},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{allenzhu2019convergence,
title={A Convergence Theory for Deep Learning via Over-Parameterization},
author={Zeyuan Allen-Zhu and Yuanzhi Li and Zhao Song},
year={2019},
eprint={1811.03962},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2017-luo,
title={ThiNet: A Filter Level Pruning Method for Deep Neural Network Compression},
author={Jian-Hao Luo and Jianxin Wu and Weiyao Lin},
year={2017},
eprint={1707.06342},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-han,
title={DSD: Dense-Sparse-Dense Training for Deep Neural Networks},
author={Song Han and Jeff Pool and Sharan Narang and Huizi Mao and Enhao Gong and Shijian Tang and Erich Elsen and Peter Vajda and Manohar Paluri and John Tran and Bryan Catanzaro and William J. Dally},
year={2017},
eprint={1607.04381},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-neklyudov,
title={Structured Bayesian Pruning via Log-Normal Multiplicative Noise},
author={Kirill Neklyudov and Dmitry Molchanov and Arsenii Ashukha and Dmitry Vetrov},
year={2017},
eprint={1705.07283},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{2020-azarian,
title={Learned Threshold Pruning},
author={Kambiz Azarian and Yash Bhalgat and Jinwon Lee and Tijmen Blankevoort},
year={2020},
eprint={2003.00075},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2017-srinivas,
title={Training Sparse Neural Networks},
author={Suraj Srinivas and Akshayvarun Subramanya and R. Venkatesh Babu},
year={2016},
eprint={1611.06694},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-he,
title={Channel Pruning for Accelerating Very Deep Neural Networks},
author={Yihui He and Xiangyu Zhang and Jian Sun},
year={2017},
eprint={1707.06168},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@ARTICLE{2018-dey,
author={S. {Dey} and K. {Huang} and P. A. {Beerel} and K. M. {Chugg}},
journal={IEEE Journal on Emerging and Selected Topics in Circuits and Systems},
title={Pre-Defined Sparse Neural Networks With Hardware Acceleration},
year={2019},
volume={9},
number={2},
pages={332-345},
doi={10.1109/JETCAS.2019.2910864}}
@misc{2017-bourely,
title={Sparse Neural Networks Topologies},
author={Alfred Bourely and John Patrick Boueri and Krzysztof Choromonski},
year={2017},
eprint={1706.05683},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2017-prabhu,
title={Deep Expander Networks: Efficient Deep Networks from Graph Theory},
author={Ameya Prabhu and Girish Varma and Anoop Namboodiri},
year={2018},
eprint={1711.08757},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{2018-manessi,
title={Automated Pruning for Deep Neural Network Compression},
author={Manessi, Franco and Rozza, Alessandro and Bianco, Simone and Napoletano, Paolo and Schettini, Raimondo},
booktitle={2018 24th International Conference on Pattern Recognition (ICPR)},
publisher={IEEE},
year={2018},
month={Aug},
isbn={9781538637883},
url={http://dx.doi.org/10.1109/ICPR.2018.8546129},
doi={10.1109/ICPR.2018.8546129}
}
@misc{2017-yang,
title={Designing Energy-Efficient Convolutional Neural Networks using Energy-Aware Pruning},
author={Tien-Ju Yang and Yu-Hsin Chen and Vivienne Sze},
year={2017},
eprint={1611.05128},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-liu,
title={Learning Efficient Convolutional Networks through Network Slimming},
author={Zhuang Liu and Jianguo Li and Zhiqiang Shen and Gao Huang and Shoumeng Yan and Changshui Zhang},
year={2017},
eprint={1708.06519},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-ullrich,
title={Soft Weight-Sharing for Neural Network Compression},
author={Karen Ullrich and Edward Meeds and Max Welling},
year={2017},
eprint={1702.04008},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{2016-jin,
title={Training Skinny Deep Neural Networks with Iterative Hard Thresholding Methods},
author={Xiaojie Jin and Xiaotong Yuan and Jiashi Feng and Shuicheng Yan},
year={2016},
eprint={1607.05423},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{2019-zhang-compact,
author = {Zhang, Jeff (Jun) and Raj, Parul and Zarar, Shuayb and Ambardekar, Amol and Garg, Siddharth},
title = {CompAct: On-Chip ComPression of ActIvations for Low Power Systolic Array Based CNN Acceleration},
year = {2019},
issue_date = {October 2019},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {18},
number = {5s},
issn = {1539-9087},
url = {https://doi.org/10.1145/3358178},
doi = {10.1145/3358178},
abstract = {This paper addresses the design of systolic array (SA) based convolutional neural network (CNN) accelerators for mobile and embedded domains. On- and off-chip memory accesses to the large activation inputs (sometimes called feature maps) of CNN layers contribute significantly to total energy consumption for such accelerators; while prior has proposed off-chip compression, activations are still stored on-chip in uncompressed form, requiring either large on-chip activation buffers or slow and energy-hungry off-chip accesses. In this paper, we propose CompAct, a new architecture that enables on-chip compression of activations for SA based CNN accelerators. CompAct is built around several key ideas. First, CompAct identifies an SA schedule that has nearly regular access patterns, enabling the use of a modified run-length coding scheme (RLC). Second, CompAct improves compression ratio of the RLC scheme using Sparse-RLC in later CNN layers and Lossy-RLC in earlier layers. Finally, CompAct proposes look-ahead snoozing that operates synergistically with RLC to reduce the leakage energy of activation buffers. Based on detailed synthesis results, we show that CompAct enables up to 62% reduction in activation buffer energy, and 34% reduction in total chip energy.},
journal = {ACM Trans. Embed. Comput. Syst.},
month = oct,
articleno = {47},
numpages = {24},
keywords = {systolic arrays, low-power design, Deep neural networks}
}
@article{2016-scardapane,
title = "Group sparse regularization for deep neural networks",
journal = "Neurocomputing",
volume = "241",
pages = "81 - 89",
year = "2017",
issn = "0925-2312",
doi = "https://doi.org/10.1016/j.neucom.2017.02.029",
url = "http://www.sciencedirect.com/science/article/pii/S0925231217302990",
author = "Simone Scardapane and Danilo Comminiello and Amir Hussain and Aurelio Uncini",
keywords = "Deep networks, Group sparsity, Pruning, Feature selection",
abstract = "In this paper, we address the challenging task of simultaneously optimizing (i) the weights of a neural network, (ii) the number of neurons for each hidden layer, and (iii) the subset of active input features (i.e., feature selection). While these problems are traditionally dealt with separately, we propose an efficient regularized formulation enabling their simultaneous parallel execution, using standard optimization routines. Specifically, we extend the group Lasso penalty, originally proposed in the linear regression literature, to impose group-level sparsity on the network’s connections, where each group is defined as the set of outgoing weights from a unit. Depending on the specific case, the weights can be related to an input variable, to a hidden neuron, or to a bias unit, thus performing simultaneously all the aforementioned tasks in order to obtain a compact network. We carry out an extensive experimental evaluation, in comparison with classical weight decay and Lasso penalties, both on a toy dataset for handwritten digit recognition, and multiple realistic mid-scale classification benchmarks. Comparative results demonstrate the potential of our proposed sparse group Lasso penalty in producing extremely compact networks, with a significantly lower number of input features, with a classification accuracy which is equal or only slightly inferior to standard regularization terms."
}
@misc{2017-changpinyo,
title={The Power of Sparsity in Convolutional Neural Networks},
author={Soravit Changpinyo and Mark Sandler and Andrey Zhmoginov},
year={2017},
eprint={1702.06257},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{2016-zhou,
title={Less is more: Towards compact cnns},
author={Zhou, Hao and Alvarez, Jose M and Porikli, Fatih},
booktitle={European Conference on Computer Vision},
pages={662--677},
year={2016},
organization={Springer}
}
@inproceedings{im2col,
title={High performance convolutional neural networks for document processing},
author={Chellapilla, Kumar and Puri, Sidd and Simard, Patrice},
booktitle={Tenth International Workshop on Frontiers in Handwriting Recognition},
year={2006}
}
@misc{2015-lebedev,
title={Fast ConvNets Using Group-wise Brain Damage},
author={Vadim Lebedev and Victor Lempitsky},
year={2015},
eprint={1506.02515},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2018-mittal,
title={Recovering from Random Pruning: On the Plasticity of Deep Convolutional Neural Networks},
author={Deepak Mittal and Shweta Bhardwaj and Mitesh M. Khapra and Balaraman Ravindran},
year={2018},
eprint={1801.10447},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2015-tompson,
title={Efficient Object Localization Using Convolutional Networks},
author={Jonathan Tompson and Ross Goroshin and Arjun Jain and Yann LeCun and Christopher Bregler},
year={2015},
eprint={1411.4280},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2015-mariet,
title={Diversity Networks: Neural Network Compression Using Determinantal Point Processes},
author={Zelda Mariet and Suvrit Sra},
year={2017},
eprint={1511.05077},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@ARTICLE{2015-polyak,
author={A. {Polyak} and L. {Wolf}},
journal={IEEE Access},
title={Channel-level acceleration of deep face representations},
year={2015},
volume={3},
number={},
pages={2163-2175},
doi={10.1109/ACCESS.2015.2494536}}
@article{2015-anwar,
title={Structured pruning of deep convolutional neural networks},
author={Anwar, Sajid and Hwang, Kyuyeon and Sung, Wonyong},
journal={ACM Journal on Emerging Technologies in Computing Systems (JETC)},
volume={13},
number={3},
pages={1--18},
year={2017},
publisher={ACM New York, NY, USA}
}
@misc{2015-srinivas-relu,
title={Learning Neural Network Architectures using Backpropagation},
author={Suraj Srinivas and R. Venkatesh Babu},
year={2016},
eprint={1511.05497},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@inproceedings{2011-glorot,
title={Deep sparse rectifier neural networks},
author={Glorot, Xavier and Bordes, Antoine and Bengio, Yoshua},
booktitle={Proceedings of the fourteenth international conference on artificial intelligence and statistics},
pages={315--323},
year={2011}
}
@inproceedings{2010-glorot-init,
added-at = {2019-05-29T00:00:00.000+0200},
author = {Glorot, Xavier and Bengio, Yoshua},
biburl = {https://www.bibsonomy.org/bibtex/221d2d1490c8404f823f1d36b294fce72/dblp},
booktitle = {AISTATS},
editor = {Teh, Yee Whye and Titterington, D. Mike},
ee = {http://proceedings.mlr.press/v9/glorot10a.html},
interhash = {4f45a520bb65b6045bd237963ffee0ed},
intrahash = {21d2d1490c8404f823f1d36b294fce72},
keywords = {dblp},
pages = {249-256},
publisher = {JMLR.org},
series = {JMLR Proceedings},
timestamp = {2019-05-30T11:50:49.000+0200},
title = {Understanding the difficulty of training deep feedforward neural networks.},
url = {http://dblp.uni-trier.de/db/journals/jmlr/jmlrp9.html#GlorotB10},
volume = 9,
year = 2010
}
@misc{2015-he-init,
title={Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification},
author={Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
year={2015},
eprint={1502.01852},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{2008-narasimha,
title = "An integrated growing-pruning method for feedforward network training",
journal = "Neurocomputing",
volume = "71",
number = "13",
pages = "2831 - 2847",
year = "2008",
note = "Artificial Neural Networks (ICANN 2006) / Engineering of Intelligent Systems (ICEIS 2006)",
issn = "0925-2312",
doi = "https://doi.org/10.1016/j.neucom.2007.08.026",
url = "http://www.sciencedirect.com/science/article/pii/S0925231207003086",
author = "Pramod L. Narasimha and Walter H. Delashmit and Michael T. Manry and Jiang Li and Francisco Maldonado",
keywords = "Growing, Pruning, Cascade correlation, Back propagation, Output weight optimization–Hidden weight optimization",
abstract = "In order to facilitate complexity optimization in feedforward networks, several algorithms are developed that combine growing and pruning. First, a growing scheme is presented which iteratively adds new hidden units to full-trained networks. Then, a non-heuristic one-pass pruning technique is presented, which utilizes orthogonal least squares. Based upon pruning, a one-pass approach is developed for generating the validation error versus network size curve. A combined approach is described in which networks are continually pruned during the growing process. As a result, the hidden units are ordered according to their usefulness, and the least useful units are eliminated. Examples show that networks designed using the combined method have less training and validation error than growing or pruning alone. The combined method exhibits reduced sensitivity to the initial weights and generates an almost monotonic error versus network size curve. It is shown to perform better than two well-known growing methods—constructive backpropagation and cascade correlation."
}
@article{1997-prechelt,
title = "Connection pruning with static and adaptive pruning schedules",
journal = "Neurocomputing",
volume = "16",
number = "1",
pages = "49 - 61",
year = "1997",
issn = "0925-2312",
doi = "https://doi.org/10.1016/S0925-2312(96)00054-9",
url = "http://www.sciencedirect.com/science/article/pii/S0925231296000549",
author = "Lutz Prechelt",
keywords = "Empirical study, Pruning, Early stopping, Generalization",
abstract = "Neural network pruning methods on the level of individual network parameters (e.g. connection weights) can improve generalization, as is shown in this empirical study. However, an open problem in the pruning methods known today (e.g. OBD, OBS, autoprune, epsiprune) is the selection of the number of parameters to be removed in each pruning step (pruning strength). This work presents a pruning method lprune that automatically adapts the pruning strength to the evolution of weights and loss of generalization during training. The method requires no algorithm parameter adjustment by the user. Results of statistical significance tests comparing autoprune, lprune, and static networks with early stopping are given, based on extensive experimentation with 14 different problems. The results indicate that training with pruning is often significantly better and rarely significantly worse than training with early stopping without pruning. Furthermore, lprune is often superior to autoprune (which is superior to OBD) on diagnosis tasks unless severe pruning early in the training process is required."
}
@article{1996-cibas,
title = "Variable selection with neural networks",
journal = "Neurocomputing",
volume = "12",
number = "2",
pages = "223 - 248",
year = "1996",
note = "Current European Neurocomputing Research",
issn = "0925-2312",
doi = "https://doi.org/10.1016/0925-2312(95)00121-2",
url = "http://www.sciencedirect.com/science/article/pii/0925231295001212",
author = "Tautvydas Cibas and Françoise Fogelman Soulié and Patrick Gallinari and Sarunas Raudys",
keywords = "Variable selection, Regularization, Neural network pruning, Dimensionality reduction",
abstract = "In this paper, we present 3 different neural network-based methods to perform variable selection. OCD — Optimal Cell Damage — is a pruning method, which evaluates the usefulness of a variable and prunes the least useful ones (it is related to the Optimal Brain Damage method of Le Cun et al.). Regularization theory proposes to constrain estimators by adding a term to the cost function used to train a neural network. In the Bayesian framework, this additional term can be interpreted as the log prior to the weights distribution. We propose to use two priors (a Gaussian and a Gaussian mixture) and show that this regularization approach allows to select efficient subsets of variables. Our methods are compared to conventional statistical selection procedures and are shown to significantly improve on that."
}
@book{grunwald2007minimum,
title={The minimum description length principle},
author={Gr{\"u}nwald, Peter D},
year={2007},
publisher={MIT Press}
}
@INPROCEEDINGS{713928,
author={P. {Burrascano}},
booktitle={Proceedings of 1993 International Conference on Neural Networks (IJCNN-93-Nagoya, Japan)},
title={A pruning technique maximizing generalization},
year={1993},
volume={1},
number={},
pages={347-350 vol.1},
doi={10.1109/IJCNN.1993.713928}}
@inproceedings{NIPS1995_3473decc,
author = {Pedersen, Morten and Hansen, Lars and Larsen, Jan},
booktitle = {Advances in Neural Information Processing Systems},
editor = {D. Touretzky and M. C. Mozer and M. Hasselmo},
pages = {521--527},
publisher = {MIT Press},
title = {Pruning with generalization based weight saliencies: $\lambda$OBD, $\lambda$OBS},
url = {https://proceedings.neurips.cc/paper/1995/file/3473decccb0509fb264818a7512a8b9b-Paper.pdf},
volume = {8},
year = {1996}
}
@INPROCEEDINGS{1993-tamura,
author={S. {Tamura} and M. {Tateishi} and M. {Matumoto} and S. {Akita}},
booktitle={Proceedings of 1993 International Conference on Neural Networks (IJCNN-93-Nagoya, Japan)},
title={Determination of the number of redundant hidden units in a three-layered feedforward neural network},
year={1993},
volume={1},
number={},
pages={335-338 vol.1},
doi={10.1109/IJCNN.1993.713925}}
@inproceedings{10.5555/646365.691221,
author = {White, David and Ligomenides, Panos A.},
title = {GANNet: A Genetic Algorithm for Optimizing Topology and Weights in Neural Network Design},
year = {1993},
isbn = {3540567984},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
booktitle = {Proceedings of the International Workshop on Artificial Neural Networks: New Trends in Neural Computation},
pages = {322–327},
numpages = {6},
series = {IWANN '93}
}
@inproceedings{whitley:ijcnn90,
added-at = {2008-03-11T14:52:34.000+0100},
author = {Whitley, D. and Bogart, C.},
biburl = {https://www.bibsonomy.org/bibtex/2735ce53b3256af7931f2391503645044/idsia},
booktitle = {{P}roceedings of the International Joint Conference on Neural Networks {\rm ({W}ashington, {DC})}},
citeulike-article-id = {2379763},
interhash = {f825f0c086e007e378048c52ec98afa7},
intrahash = {735ce53b3256af7931f2391503645044},
keywords = {nn},
pages = {134--137},
priority = {2},
publisher = {IEEE Press},
timestamp = {2008-03-11T14:57:16.000+0100},
title = {The Evolution of Connectivity: {P}runing Neural Networks Using Genetic Algorithms},
year = 1990
}
@inproceedings{NIPS1988_1c9ac015,
author = {Hanson, Stephen and Pratt, Lorien},
booktitle = {Advances in Neural Information Processing Systems},
editor = {D. Touretzky},
pages = {177--185},
publisher = {Morgan-Kaufmann},
title = {Comparing Biases for Minimal Network Construction with Back-Propagation},
url = {https://proceedings.neurips.cc/paper/1988/file/1c9ac0159c94d8d0cbedc973445af2da-Paper.pdf},
volume = {1},
year = {1989}
}