@Article{ karl_hierarchical_models,
author = {K.J. Friston},
title = {Hierarchical Models in the Brain},
journal = {PLOS Computational Biology},
year = {2008},
volume = {4},
number = {11},
pages = {e1000211},
doi = {10.1371/journal.pcbi.1000211},
pdf = {/spm/doc/papers/Hierarchical_Models_in_the_Brain.pdf},
keyword = {DEM}
}
@inproceedings{1991-moody,
title={Note on generalization, regularization and architecture selection in nonlinear learning systems},
author={Moody, John E},
booktitle={Neural Networks for Signal Processing Proceedings of the 1991 IEEE Workshop},
pages={1--10},
year={1991},
organization={IEEE}
}
@inproceedings{1994-hansen,
title={Controlled growth of cascade correlation nets},
author={Hansen, Lars Kai and others},
booktitle={International Conference on Artificial Neural Networks},
pages={797--800},
year={1994},
organization={Springer}
}
@INPROCEEDINGS{2020-qin,
author={E. {Qin} and A. {Samajdar} and H. {Kwon} and V. {Nadella} and S. {Srinivasan} and D. {Das} and B. {Kaul} and T. {Krishna}},
booktitle={2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
title={SIGMA: A Sparse and Irregular GEMM Accelerator with Flexible Interconnects for DNN Training},
year={2020},
volume={},
number={},
pages={58-70},
doi={10.1109/HPCA47549.2020.00015}}
@article{lillicrap2020backpropagation,
title={Backpropagation and the brain},
author={Lillicrap, Timothy P and Santoro, Adam and Marris, Luke and Akerman, Colin J and Hinton, Geoffrey},
journal={Nature Reviews Neuroscience},
pages={1--12},
year={2020},
publisher={Nature Publishing Group}
}
@inproceedings{2019-jin,
author = {Jin, Sian and Di, Sheng and Liang, Xin and Tian, Jiannan and Tao, Dingwen and Cappello, Franck},
title = {DeepSZ: A Novel Framework to Compress Deep Neural Networks by Using Error-Bounded Lossy Compression},
year = {2019},
isbn = {9781450366700},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3307681.3326608},
doi = {10.1145/3307681.3326608},
abstract = {Today's deep neural networks (DNNs) are becoming deeper and wider because of increasing demand on the analysis quality and more and more complex applications to resolve. The wide and deep DNNs, however, require large amounts of resources (such as memory, storage, and I/O), significantly restricting their utilization on resource-constrained platforms. Although some DNN simplification methods (such as weight quantization) have been proposed to address this issue, they suffer from either low compression ratios or high compression errors, which may introduce an expensive fine-tuning overhead (i.e., a costly retraining process for the target inference accuracy). In this paper, we propose DeepSZ: an accuracy-loss expected neural network compression framework, which involves four key steps: network pruning, error bound assessment, optimization for error bound configuration, and compressed model generation, featuring a high compression ratio and low encoding time. The contribution is threefold. (1)We develop an adaptive approach to select the feasible error bounds for each layer. (2) We build a model to estimate the overall loss of inference accuracy based on the inference accuracy degradation caused by individual decompressed layers. (3) We develop an efficient optimization algorithm to determine the best-fit configuration of error bounds in order to maximize the compression ratio under the user-set inference accuracy constraint. Experiments show that DeepSZ can compress AlexNet and VGG-16 on the ImageNet dataset by a compression ratio of 46\texttimes{} and 116\texttimes{}, respectively, and compress LeNet-300-100 and LeNet-5 on the MNIST dataset by a compression ratio of 57\texttimes{} and 56\texttimes{}, respectively, with only up to 0.3% loss of inference accuracy. Compared with other state-of-the-art methods, DeepSZ can improve the compression ratio by up to 1.43\texttimes{}, the DNN encoding performance by up to 4.0\texttimes{} with four V100 GPUs, and the decoding performance by up to 6.2\texttimes{}.},
booktitle = {Proceedings of the 28th International Symposium on High-Performance Parallel and Distributed Computing},
pages = {159–170},
numpages = {12},
keywords = {performance, neural networks, lossy compression, deep learning},
location = {Phoenix, AZ, USA},
series = {HPDC '19}
}
@inproceedings{2019-zhang-snap,
title={SNAP: A 1.67--21.55TOPS/W Sparse Neural Acceleration Processor for Unstructured Sparse Deep Neural Network Inference in 16nm CMOS},
author={Jie-Fang Zhang and Ching-En Lee and C. Liu and Y. Shao and Stephen W. Keckler and Zhengya Zhang},
booktitle={2019 Symposium on VLSI Circuits},
year={2019},
pages={C306-C307}
}
@INPROCEEDINGS{2012-yu,
author={D. {Yu} and F. {Seide} and G. {Li} and L. {Deng}},
booktitle={2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Exploiting sparseness in deep neural networks for large vocabulary speech recognition},
year={2012},
volume={},
number={},
pages={4409-4412},
doi={10.1109/ICASSP.2012.6288897}}
@inproceedings{10.5555/2986916.2987033,
author = {Krogh, Anders and Hertz, John A.},
title = {A Simple Weight Decay Can Improve Generalization},
year = {1991},
isbn = {1558602224},
publisher = {Morgan Kaufmann Publishers Inc.},
address = {San Francisco, CA, USA},
abstract = {It has been observed in numerical simulations that a weight decay can improve generalization in a feed-forward neural network. This paper explains why. It is proven that a weight decay has two effects in a linear network. First, it suppresses any irrelevant components of the weight vector by choosing the smallest vector that solves the learning problem. Second, if the size is chosen right, a weight decay can suppress some of the effects of static noise on the targets, which improves generalization quite a lot. It is then shown how to extend these results to networks with hidden layers and non-linear units. Finally the theory is confirmed by some numerical simulations using the data from NetTalk.},
booktitle = {Proceedings of the 4th International Conference on Neural Information Processing Systems},
pages = {950–957},
numpages = {8},
location = {Denver, Colorado},
series = {NIPS'91}
}
@misc{2019-niu,
title={SPEC2: SPECtral SParsE CNN Accelerator on FPGAs},
author={Yue Niu and Hanqing Zeng and Ajitesh Srivastava and Kartik Lakhotia and Rajgopal Kannan and Yanzhi Wang and Viktor Prasanna},
year={2019},
eprint={1910.11103},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{rasmussen2001occam,
title={Occam's razor},
author={Rasmussen, Carl Edward and Ghahramani, Zoubin},
booktitle={Advances in neural information processing systems},
pages={294--300},
year={2001}
}
@misc{2018-zhu,
title={SparseNN: An Energy-Efficient Neural Network Accelerator Exploiting Input and Output Sparsity},
author={Jingyang Zhu and Jingbo Jiang and Xizi Chen and Chi-Ying Tsui},
year={2017},
eprint={1711.01263},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{issr,
title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
author={Paul Scheffler and Florian Zaruba and Fabian Schuiki and Torsten Hoefler and Luca Benini},
year={2020},
eprint={2011.08070},
archivePrefix={arXiv},
primaryClass={cs.AR}
}
@inproceedings{2020-hegde,
author = {Hegde, Kartik and Asghari-Moghaddam, Hadi and Pellauer, Michael and Crago, Neal and Jaleel, Aamer and Solomonik, Edgar and Emer, Joel and Fletcher, Christopher W.},
title = {ExTensor: An Accelerator for Sparse Tensor Algebra},
year = {2019},
isbn = {9781450369381},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3352460.3358275},
doi = {10.1145/3352460.3358275},
abstract = {Generalized tensor algebra is a prime candidate for acceleration via customized ASICs. Modern tensors feature a wide range of data sparsity, with the density of non-zero elements ranging from $10^{-6}$% to 50%. This paper proposes a novel approach to accelerate tensor kernels based on the principle of hierarchical elimination of computation in the presence of sparsity. This approach relies on rapidly finding intersections---situations where both operands of a multiplication are non-zero---enabling new data fetching mechanisms and avoiding memory latency overheads associated with sparse kernels implemented in software. We propose the ExTensor accelerator, which builds these novel ideas on handling sparsity into hardware to enable better bandwidth utilization and compute throughput. We evaluate ExTensor on several kernels relative to industry libraries (Intel MKL) and state-of-the-art tensor algebra compilers (TACO). When bandwidth normalized, we demonstrate an average speedup of 3.4\texttimes{}, 1.3\texttimes{}, 2.8\texttimes{}, 24.9\texttimes{}, and 2.7\texttimes{} on SpMSpM, SpMM, TTV, TTM, and SDDMM kernels respectively over a server class CPU.},
booktitle = {Proceedings of the 52nd Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {319–333},
numpages = {15},
keywords = {Tensor Algebra, Sparse Computation, Hardware Acceleration},
location = {Columbus, OH, USA},
series = {MICRO '52}
}
@INPROCEEDINGS{2017-hill,
author={P. {Hill} and A. {Jain} and M. {Hill} and B. {Zamirai} and C. {Hsu} and M. A. {Laurenzano} and S. {Mahlke} and L. {Tang} and J. {Mars}},
booktitle={2017 50th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
title={DeftNN: Addressing Bottlenecks for DNN Execution on GPUs via Synapse Vector Elimination and Near-compute Data Fission},
year={2017},
volume={},
number={},
pages={786-799},
doi={}}
@ARTICLE{2017-kim,
author={D. {Kim} and J. {Ahn} and S. {Yoo}},
journal={IEEE Design \& Test},
title={ZeNA: Zero-Aware Neural Network Accelerator},
year={2018},
volume={35},
number={1},
pages={39-46},
doi={10.1109/MDAT.2017.2741463}}
@ARTICLE{2019-li,
author={J. {Li} and S. {Jiang} and S. {Gong} and J. {Wu} and J. {Yan} and G. {Yan} and X. {Li}},
journal={IEEE Transactions on Computers},
title={SqueezeFlow: A Sparse CNN Accelerator Exploiting Concise Convolution Rules},
year={2019},
volume={68},
number={11},
pages={1663-1677},
doi={10.1109/TC.2019.2924215}}
@inproceedings{2019-zhang-eager,
author = {Zhang, Jiaqi and Chen, Xiangru and Song, Mingcong and Li, Tao},
title = {Eager Pruning: Algorithm and Architecture Support for Fast Training of Deep Neural Networks},
year = {2019},
isbn = {9781450366694},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3307650.3322263},
doi = {10.1145/3307650.3322263},
abstract = {Today's big and fast data and the changing circumstance require fast training of Deep Neural Networks (DNN) in various applications. However, training a DNN with tons of parameters involves intensive computation. Enlightened by the fact that redundancy exists in DNNs and the observation that the ranking of the significance of the weights changes slightly during training, we propose Eager Pruning, which speeds up DNN training by moving pruning to an early stage.Eager Pruning is supported by an algorithm and architecture co-design. The proposed algorithm dictates the architecture to identify and prune insignificant weights during training without accuracy loss. A novel architecture is designed to transform the reduced training computation into performance improvement. Our proposed Eager Pruning system gains an average of 1.91x speedup over state-of-the-art hardware accelerator and 6.31x energy-efficiency over Nvidia GPUs.},
booktitle = {Proceedings of the 46th International Symposium on Computer Architecture},
pages = {292–303},
numpages = {12},
keywords = {neural network training, neural network pruning, software-hardware co-design},
location = {Phoenix, Arizona},
series = {ISCA '19}
}
@misc{2018-kung,
title={Packing Sparse Convolutional Neural Networks for Efficient Systolic Array Implementations: Column Combining Under Joint Optimization},
author={H. T. Kung and Bradley McDanel and Sai Qian Zhang},
year={2018},
eprint={1811.04770},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@INPROCEEDINGS{2016-albericio,
author={J. {Albericio} and P. {Judd} and T. {Hetherington} and T. {Aamodt} and N. E. {Jerger} and A. {Moshovos}},
booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
title={Cnvlutin: Ineffectual-Neuron-Free Deep Neural Network Computing},
year={2016},
volume={},
number={},
pages={1-13},
doi={10.1109/ISCA.2016.11}}
@misc{a100,
title={NVIDIA A100 Tensor Core GPU Architecture},
author={{Nvidia}},
year={2020},
}
@book{UsingAdvancedMPI,
author={William Gropp and Torsten Hoefler and Rajeev Thakur and E. Lusk},
title={{Using Advanced MPI: Modern Features of the Message-Passing Interface}},
year={2014},
month={Nov.},
location={Cambridge, MA},
publisher={MIT Press},
isbn={978-0262527637},
source={http://www.unixer.de/~htor/publications/},
}
@inproceedings{gropp-datatype-performance,
author={William Gropp and Torsten Hoefler and Rajeev Thakur and Jesper Larsson Träff},
title={{Performance Expectations and Guidelines for MPI Derived Datatypes}},
year={2011},
month={Sep.},
pages={150-159},
volume={6960},
booktitle={Recent Advances in the Message Passing Interface (EuroMPI'11)},
location={Santorini, Greece},
publisher={Springer},
isbn={978-3-642-24448-3},
source={http://www.unixer.de/~htor/publications/},
}
@misc{2020-zhang,
title={SpArch: Efficient Architecture for Sparse Matrix Multiplication},
author={Zhekai Zhang and Hanrui Wang and Song Han and William J. Dally},
year={2020},
eprint={2002.08947},
archivePrefix={arXiv},
primaryClass={cs.AR}
}
@misc{2016-park,
title={Faster CNNs with Direct Sparse Convolutions and Guided Pruning},
author={Jongsoo Park and Sheng Li and Wei Wen and Ping Tak Peter Tang and Hai Li and Yiran Chen and Pradeep Dubey},
year={2017},
eprint={1608.01409},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{1996-olshausen,
title={Emergence of simple-cell receptive field properties by learning a sparse code for natural images},
author={Olshausen, Bruno A and Field, David J},
journal={Nature},
volume={381},
number={6583},
pages={607--609},
year={1996},
publisher={Nature Publishing Group}
}
@article{1989-janowsky,
title={Pruning versus clipping in neural networks},
author={Janowsky, Steven A},
journal={Physical Review A},
volume={39},
number={12},
pages={6600},
year={1989},
publisher={APS}
}
@INPROCEEDINGS{7780804,
author={A. {Lavin} and S. {Gray}},
booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
title={Fast Algorithms for Convolutional Neural Networks},
year={2016},
volume={},
number={},
pages={4013-4021},
doi={10.1109/CVPR.2016.435}}
@inproceedings{2020-gondimalla,
author = {Gondimalla, Ashish and Chesnut, Noah and Thottethodi, Mithuna and Vijaykumar, T. N.},
title = {SparTen: A Sparse Tensor Accelerator for Convolutional Neural Networks},
year = {2019},
isbn = {9781450369381},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3352460.3358291},
doi = {10.1145/3352460.3358291},
abstract = {Convolutional neural networks (CNNs) are emerging as powerful tools for image processing. Recent machine learning work has reduced CNNs' compute and data volumes by exploiting the naturally-occurring and actively-transformed zeros in the feature maps and filters. While previous semi-sparse architectures exploit one-sided sparsity either in the feature maps or the filters, but not both, a recent fully-sparse architecture, called Sparse CNN (SCNN), exploits two-sided sparsity to improve performance and energy over dense architectures. However, sparse vector-vector dot product, a key primitive in sparse CNNs, would be inefficient using the representation adopted by SCNN. The dot product requires finding and accessing non-zero elements in matching positions in the two sparse vectors -- an inner join using the position as the key with a single value field. SCNN avoids the inner join by performing a Cartesian product capturing the relevant multiplications. However, SCNN's approach incurs several considerable overheads and is not applicable to non-unit-stride convolutions. Further, exploiting reuse in sparse CNNs fundamentally causes systematic load imbalance not addressed by SCNN. We propose SparTen which achieves efficient inner join by providing support for native two-sided sparse execution and memory storage. To tackle load imbalance, SparTen employs a software scheme, called greedy balancing, which groups filters by density via two variants, a software-only one which uses whole-filter density and a software-hardware hybrid which uses finer-grain density. Our simulations show that, on average, SparTen performs 4.7x, 1.8x, and 3x better than a dense architecture, one-sided sparse architecture, and SCNN, respectively. An FPGA implementation shows that SparTen performs 4.3x and 1.9x better than a dense architecture and a one-sided sparse architecture, respectively.},
booktitle = {Proceedings of the 52nd Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {151–165},
numpages = {15},
keywords = {Convolutional neural networks, Accelerators, Sparse tensors},
location = {Columbus, OH, USA},
series = {MICRO '52}
}
@misc{2020-yang,
title={Procrustes: a Dataflow and Accelerator for Sparse Deep Neural Network Training},
author={Dingqing Yang and Amin Ghasemazar and Xiaowei Ren and Maximilian Golub and Guy Lemieux and Mieszko Lis},
year={2020},
eprint={2009.10976},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@INPROCEEDINGS{2018-zhou,
author={X. {Zhou} and Z. {Du} and Q. {Guo} and S. {Liu} and C. {Liu} and C. {Wang} and X. {Zhou} and L. {Li} and T. {Chen} and Y. {Chen}},
booktitle={2018 51st Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
title={Cambricon-S: Addressing Irregularity in Sparse Neural Networks through A Cooperative Software/Hardware Approach},
year={2018},
volume={},
number={},
pages={15-28},
doi={10.1109/MICRO.2018.00011}}
@INPROCEEDINGS{2016-zhu,
author={ Jingyang Zhu and Zhiliang Qian and Chi-Ying Tsui},
booktitle={2016 21st Asia and South Pacific Design Automation Conference (ASP-DAC)},
title={LRADNN: High-throughput and energy-efficient Deep Neural Network accelerator using Low Rank Approximation},
year={2016},
volume={},
number={},
pages={581-586},
doi={10.1109/ASPDAC.2016.7428074}}
@INPROCEEDINGS{2016-zhang,
author={S. {Zhang} and Z. {Du} and L. {Zhang} and H. {Lan} and S. {Liu} and L. {Li} and Q. {Guo} and T. {Chen} and Y. {Chen}},
booktitle={2016 49th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
title={Cambricon-X: An accelerator for sparse neural networks},
year={2016},
volume={},
number={},
pages={1-12},
doi={10.1109/MICRO.2016.7783723}}
@inproceedings{2020-niu,
author = {Niu, Yue and Kannan, Rajgopal and Srivastava, Ajitesh and Prasanna, Viktor},
title = {Reuse Kernels or Activations? A Flexible Dataflow for Low-Latency Spectral CNN Acceleration},
year = {2020},
isbn = {9781450370998},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3373087.3375302},
doi = {10.1145/3373087.3375302},
abstract = {Spectral-domain CNNs have been shown to be more efficient than traditional spatial CNNs in terms of reducing computation complexity. However they come with a 'kernel explosion' problem that, even after compression (pruning), imposes a high memory burden and off-chip bandwidth requirement for kernel access. This creates a performance gap between the potential acceleration offered by compression and actual FPGA implementation performance, especially for low-latency CNN inference. In this paper, we develop a principled approach to overcoming this performance gap and designing a low-latency, low-bandwidth, spectral sparse CNN accelerator on FPGAs. First, we analyze the bandwidth-storage tradeoff of sparse convolutional layers and locate communication bottlenecks. We then develop a dataflow for flexibly optimizing data reuse in different layers to minimize off-chip communication. Finally, we propose a novel scheduling algorithm to optimally schedule the on-chip memory access of multiple sparse kernels and minimize read conflicts. On a state-of-the-art FPGA platform, our design reduces data transfers by 42% with DSP utilization up to 90% and achieves inference latency of 9 ms for VGG16, compared to the baseline state-of-the-art latency of 68 ms.},
booktitle = {Proceedings of the 2020 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
pages = {266–276},
numpages = {11},
keywords = {sparse operation, flexible dataflow, spectral cnns, accelerator},
location = {Seaside, CA, USA},
series = {FPGA '20}
}
@misc{2019-golub,
title={Full deep neural network training on a pruned weight budget},
author={Maximilian Golub and Guy Lemieux and Mieszko Lis},
year={2019},
eprint={1806.06949},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2019-gupta,
title={MASR: A Modular Accelerator for Sparse RNNs},
author={Udit Gupta and Brandon Reagen and Lillian Pentecost and Marco Donato and Thierry Tambe and Alexander M. Rush and Gu-Yeon Wei and David Brooks},
year={2019},
eprint={1908.08976},
archivePrefix={arXiv},
primaryClass={eess.SP}
}
@misc{2017-mao,
title={Exploring the Regularity of Sparse Structure in Convolutional Neural Networks},
author={Huizi Mao and Song Han and Jeff Pool and Wenshuo Li and Xingyu Liu and Yu Wang and William J. Dally},
year={2017},
eprint={1705.08922},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@ARTICLE{2016-chen,
author={Y. {Chen} and T. {Krishna} and J. S. {Emer} and V. {Sze}},
journal={IEEE Journal of Solid-State Circuits},
title={Eyeriss: An Energy-Efficient Reconfigurable Accelerator for Deep Convolutional Neural Networks},
year={2017},
volume={52},
number={1},
pages={127-138},
doi={10.1109/JSSC.2016.2616357}}
@misc{noh2015learning,
title={Learning Deconvolution Network for Semantic Segmentation},
author={Hyeonwoo Noh and Seunghoon Hong and Bohyung Han},
year={2015},
eprint={1505.04366},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2019-chen,
title={Eyeriss v2: A Flexible Accelerator for Emerging Deep Neural Networks on Mobile Devices},
author={Yu-Hsin Chen and Tien-Ju Yang and Joel Emer and Vivienne Sze},
year={2019},
eprint={1807.07928},
archivePrefix={arXiv},
primaryClass={cs.DC}
}
@misc{goodfellow2014generative,
title={Generative Adversarial Networks},
author={Ian J. Goodfellow and Jean Pouget-Abadie and Mehdi Mirza and Bing Xu and David Warde-Farley and Sherjil Ozair and Aaron Courville and Yoshua Bengio},
year={2014},
eprint={1406.2661},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@INPROCEEDINGS{2016-reagen,
author={B. {Reagen} and P. {Whatmough} and R. {Adolf} and S. {Rama} and H. {Lee} and S. K. {Lee} and J. M. {Hernández-Lobato} and G. {Wei} and D. {Brooks}},
booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
title={Minerva: Enabling Low-Power, Highly-Accurate Deep Neural Network Accelerators},
year={2016},
volume={},
number={},
pages={267-278},
doi={10.1109/ISCA.2016.32}}
@inproceedings{2018-rhu,
title={Compressing DMA engine: Leveraging activation sparsity for training deep neural networks},
author={Rhu, Minsoo and O'Connor, Mike and Chatterjee, Niladrish and Pool, Jeff and Kwon, Youngeun and Keckler, Stephen W},
booktitle={2018 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
pages={78--91},
year={2018},
organization={IEEE}
}
@misc{2017-parashar,
title={SCNN: An Accelerator for Compressed-sparse Convolutional Neural Networks},
author={Angshuman Parashar and Minsoo Rhu and Anurag Mukkara and Antonio Puglielli and Rangharajan Venkatesan and Brucek Khailany and Joel Emer and Stephen W. Keckler and William J. Dally},
year={2017},
eprint={1708.04485},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@misc{dave2020hardware,
title={Hardware Acceleration of Sparse and Irregular Tensor Computations of ML Models: A Survey and Insights},
author={Shail Dave and Riyadh Baghdadi and Tony Nowatzki and Sasikanth Avancha and Aviral Shrivastava and Baoxin Li},
year={2020},
eprint={2007.00864},
archivePrefix={arXiv},
primaryClass={cs.AR}
}
@article{2014-collins,
author = {Maxwell D. Collins and
Pushmeet Kohli},
title = {Memory Bounded Deep Convolutional Networks},
journal = {CoRR},
volume = {abs/1412.1442},
year = {2014},
url = {http://arxiv.org/abs/1412.1442},
archivePrefix = {arXiv},
eprint = {1412.1442},
timestamp = {Mon, 13 Aug 2018 16:47:16 +0200},
biburl = {https://dblp.org/rec/journals/corr/CollinsK14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{2016-han-ese,
title={ESE: Efficient Speech Recognition Engine with Sparse LSTM on FPGA},
author={Song Han and Junlong Kang and Huizi Mao and Yiming Hu and Xin Li and Yubin Li and Dongliang Xie and Hong Luo and Song Yao and Yu Wang and Huazhong Yang and William J. Dally},
year={2017},
eprint={1612.00694},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{2016-see,
title={Compression of Neural Machine Translation Models via Pruning},
author={Abigail See and Minh-Thang Luong and Christopher D. Manning},
year={2016},
eprint={1606.09274},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
@misc{2016-wen,
title={Learning Structured Sparsity in Deep Neural Networks},
author={Wei Wen and Chunpeng Wu and Yandan Wang and Yiran Chen and Hai Li},
year={2016},
eprint={1608.03665},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@misc{mdl,
title={A tutorial introduction to the minimum description length principle},
author={Grunwald, Peter},
year={2004},
eprint={math/0406077},
archivePrefix={arXiv}
}
@misc{denil2014predicting,
title={Predicting Parameters in Deep Learning},
author={Misha Denil and Babak Shakibi and Laurent Dinh and Marc'Aurelio Ranzato and Nando de Freitas},
year={2014},
eprint={1306.0543},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2020-savarese,
title={Winning the Lottery with Continuous Sparsification},
author={Pedro Savarese and Hugo Silva and Michael Maire},
year={2020},
eprint={1912.04427},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{neyshabur2018understanding,
title={Towards Understanding the Role of Over-Parametrization in Generalization of Neural Networks},
author={Behnam Neyshabur and Zhiyuan Li and Srinadh Bhojanapalli and Yann LeCun and Nathan Srebro},
year={2018},
eprint={1805.12076},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{allenzhu2019convergence,
title={A Convergence Theory for Deep Learning via Over-Parameterization},
author={Zeyuan Allen-Zhu and Yuanzhi Li and Zhao Song},
year={2019},
eprint={1811.03962},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2017-luo,
title={ThiNet: A Filter Level Pruning Method for Deep Neural Network Compression},
author={Jian-Hao Luo and Jianxin Wu and Weiyao Lin},
year={2017},
eprint={1707.06342},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-han,
title={DSD: Dense-Sparse-Dense Training for Deep Neural Networks},
author={Song Han and Jeff Pool and Sharan Narang and Huizi Mao and Enhao Gong and Shijian Tang and Erich Elsen and Peter Vajda and Manohar Paluri and John Tran and Bryan Catanzaro and William J. Dally},
year={2017},
eprint={1607.04381},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-neklyudov,
title={Structured Bayesian Pruning via Log-Normal Multiplicative Noise},
author={Kirill Neklyudov and Dmitry Molchanov and Arsenii Ashukha and Dmitry Vetrov},
year={2017},
eprint={1705.07283},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{2020-azarian,
title={Learned Threshold Pruning},
author={Kambiz Azarian and Yash Bhalgat and Jinwon Lee and Tijmen Blankevoort},
year={2020},
eprint={2003.00075},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2017-srinivas,
title={Training Sparse Neural Networks},
author={Suraj Srinivas and Akshayvarun Subramanya and R. Venkatesh Babu},
year={2016},
eprint={1611.06694},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-he,
title={Channel Pruning for Accelerating Very Deep Neural Networks},
author={Yihui He and Xiangyu Zhang and Jian Sun},
year={2017},
eprint={1707.06168},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@ARTICLE{2018-dey,
author={S. {Dey} and K. {Huang} and P. A. {Beerel} and K. M. {Chugg}},
journal={IEEE Journal on Emerging and Selected Topics in Circuits and Systems},
title={Pre-Defined Sparse Neural Networks With Hardware Acceleration},
year={2019},
volume={9},
number={2},
pages={332-345},
doi={10.1109/JETCAS.2019.2910864}}
@misc{2017-bourely,
title={Sparse Neural Networks Topologies},
author={Alfred Bourely and John Patrick Boueri and Krzysztof Choromonski},
year={2017},
eprint={1706.05683},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{2017-prabhu,
title={Deep Expander Networks: Efficient Deep Networks from Graph Theory},
author={Ameya Prabhu and Girish Varma and Anoop Namboodiri},
year={2018},
eprint={1711.08757},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{2018-manessi,
title={Automated Pruning for Deep Neural Network Compression},
author={Manessi, Franco and Rozza, Alessandro and Bianco, Simone and Napoletano, Paolo and Schettini, Raimondo},
booktitle={2018 24th International Conference on Pattern Recognition (ICPR)},
publisher={IEEE},
year={2018},
month={Aug},
isbn={9781538637883},
url={http://dx.doi.org/10.1109/ICPR.2018.8546129},
doi={10.1109/ICPR.2018.8546129}
}
@misc{2017-yang,
title={Designing Energy-Efficient Convolutional Neural Networks using Energy-Aware Pruning},
author={Tien-Ju Yang and Yu-Hsin Chen and Vivienne Sze},
year={2017},
eprint={1611.05128},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-liu,
title={Learning Efficient Convolutional Networks through Network Slimming},
author={Zhuang Liu and Jianguo Li and Zhiqiang Shen and Gao Huang and Shoumeng Yan and Changshui Zhang},
year={2017},
eprint={1708.06519},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2017-ullrich,
title={Soft Weight-Sharing for Neural Network Compression},
author={Karen Ullrich and Edward Meeds and Max Welling},
year={2017},
eprint={1702.04008},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{2016-jin,
title={Training Skinny Deep Neural Networks with Iterative Hard Thresholding Methods},
author={Xiaojie Jin and Xiaotong Yuan and Jiashi Feng and Shuicheng Yan},
year={2016},
eprint={1607.05423},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{2019-zhang-compact,
author = {Zhang, Jeff (Jun) and Raj, Parul and Zarar, Shuayb and Ambardekar, Amol and Garg, Siddharth},
title = {CompAct: On-Chip ComPression of ActIvations for Low Power Systolic Array Based CNN Acceleration},
year = {2019},
issue_date = {October 2019},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {18},
number = {5s},
issn = {1539-9087},
url = {https://doi.org/10.1145/3358178},
doi = {10.1145/3358178},
abstract = {This paper addresses the design of systolic array (SA) based convolutional neural network (CNN) accelerators for mobile and embedded domains. On- and off-chip memory accesses to the large activation inputs (sometimes called feature maps) of CNN layers contribute significantly to total energy consumption for such accelerators; while prior has proposed off-chip compression, activations are still stored on-chip in uncompressed form, requiring either large on-chip activation buffers or slow and energy-hungry off-chip accesses. In this paper, we propose CompAct, a new architecture that enables on-chip compression of activations for SA based CNN accelerators. CompAct is built around several key ideas. First, CompAct identifies an SA schedule that has nearly regular access patterns, enabling the use of a modified run-length coding scheme (RLC). Second, CompAct improves compression ratio of the RLC scheme using Sparse-RLC in later CNN layers and Lossy-RLC in earlier layers. Finally, CompAct proposes look-ahead snoozing that operates synergistically with RLC to reduce the leakage energy of activation buffers. Based on detailed synthesis results, we show that CompAct enables up to 62% reduction in activation buffer energy, and 34% reduction in total chip energy.},
journal = {ACM Trans. Embed. Comput. Syst.},
month = oct,
articleno = {47},
numpages = {24},
keywords = {systolic arrays, low-power design, Deep neural networks}
}
@article{2016-scardapane,
title = "Group sparse regularization for deep neural networks",
journal = "Neurocomputing",
volume = "241",
pages = "81 - 89",
year = "2017",
issn = "0925-2312",
doi = "https://doi.org/10.1016/j.neucom.2017.02.029",
url = "http://www.sciencedirect.com/science/article/pii/S0925231217302990",
author = "Simone Scardapane and Danilo Comminiello and Amir Hussain and Aurelio Uncini",
keywords = "Deep networks, Group sparsity, Pruning, Feature selection",
abstract = "In this paper, we address the challenging task of simultaneously optimizing (i) the weights of a neural network, (ii) the number of neurons for each hidden layer, and (iii) the subset of active input features (i.e., feature selection). While these problems are traditionally dealt with separately, we propose an efficient regularized formulation enabling their simultaneous parallel execution, using standard optimization routines. Specifically, we extend the group Lasso penalty, originally proposed in the linear regression literature, to impose group-level sparsity on the network’s connections, where each group is defined as the set of outgoing weights from a unit. Depending on the specific case, the weights can be related to an input variable, to a hidden neuron, or to a bias unit, thus performing simultaneously all the aforementioned tasks in order to obtain a compact network. We carry out an extensive experimental evaluation, in comparison with classical weight decay and Lasso penalties, both on a toy dataset for handwritten digit recognition, and multiple realistic mid-scale classification benchmarks. Comparative results demonstrate the potential of our proposed sparse group Lasso penalty in producing extremely compact networks, with a significantly lower number of input features, with a classification accuracy which is equal or only slightly inferior to standard regularization terms."
}
@misc{2017-changpinyo,
title={The Power of Sparsity in Convolutional Neural Networks},
author={Soravit Changpinyo and Mark Sandler and Andrey Zhmoginov},
year={2017},
eprint={1702.06257},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{2016-zhou,
title={Less is more: Towards compact cnns},
author={Zhou, Hao and Alvarez, Jose M and Porikli, Fatih},
booktitle={European Conference on Computer Vision},
pages={662--677},
year={2016},
organization={Springer}
}
@inproceedings{im2col,
title={High performance convolutional neural networks for document processing},
author={Chellapilla, Kumar and Puri, Sidd and Simard, Patrice},
booktitle={Tenth International Workshop on Frontiers in Handwriting Recognition},
year={2006}
}
@misc{2015-lebedev,
title={Fast ConvNets Using Group-wise Brain Damage},
author={Vadim Lebedev and Victor Lempitsky},
year={2015},
eprint={1506.02515},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2018-mittal,
title={Recovering from Random Pruning: On the Plasticity of Deep Convolutional Neural Networks},
author={Deepak Mittal and Shweta Bhardwaj and Mitesh M. Khapra and Balaraman Ravindran},
year={2018},
eprint={1801.10447},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2015-tompson,
title={Efficient Object Localization Using Convolutional Networks},
author={Jonathan Tompson and Ross Goroshin and Arjun Jain and Yann LeCun and Christopher Bregler},
year={2015},
eprint={1411.4280},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{2015-mariet,
title={Diversity Networks: Neural Network Compression Using Determinantal Point Processes},
author={Zelda Mariet and Suvrit Sra},
year={2017},
eprint={1511.05077},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@ARTICLE{2015-polyak,
author={A. {Polyak} and L. {Wolf}},
journal={IEEE Access},
title={Channel-level acceleration of deep face representations},
year={2015},
volume={3},
number={},
pages={2163-2175},
doi={10.1109/ACCESS.2015.2494536}}
@article{2015-anwar,
title={Structured pruning of deep convolutional neural networks},
author={Anwar, Sajid and Hwang, Kyuyeon and Sung, Wonyong},
journal={ACM Journal on Emerging Technologies in Computing Systems (JETC)},
volume={13},
number={3},
pages={1--18},
year={2017},
publisher={ACM New York, NY, USA}
}
@misc{2015-srinivas-relu,
title={Learning Neural Network Architectures using Backpropagation},
author={Suraj Srinivas and R. Venkatesh Babu},
year={2016},
eprint={1511.05497},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@inproceedings{2011-glorot,
title={Deep sparse rectifier neural networks},
author={Glorot, Xavier and Bordes, Antoine and Bengio, Yoshua},
booktitle={Proceedings of the fourteenth international conference on artificial intelligence and statistics},
pages={315--323},
year={2011}
}
@inproceedings{2010-glorot-init,
added-at = {2019-05-29T00:00:00.000+0200},
author = {Glorot, Xavier and Bengio, Yoshua},
biburl = {https://www.bibsonomy.org/bibtex/221d2d1490c8404f823f1d36b294fce72/dblp},
booktitle = {AISTATS},
editor = {Teh, Yee Whye and Titterington, D. Mike},
ee = {http://proceedings.mlr.press/v9/glorot10a.html},
interhash = {4f45a520bb65b6045bd237963ffee0ed},
intrahash = {21d2d1490c8404f823f1d36b294fce72},
keywords = {dblp},
pages = {249-256},
publisher = {JMLR.org},
series = {JMLR Proceedings},
timestamp = {2019-05-30T11:50:49.000+0200},
title = {Understanding the difficulty of training deep feedforward neural networks.},
url = {http://dblp.uni-trier.de/db/journals/jmlr/jmlrp9.html#GlorotB10},
volume = 9,
year = 2010
}
@misc{2015-he-init,
title={Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification},
author={Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
year={2015},
eprint={1502.01852},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{2008-narasimha,
title = "An integrated growing-pruning method for feedforward network training",
journal = "Neurocomputing",
volume = "71",
number = "13",
pages = "2831 - 2847",
year = "2008",
note = "Artificial Neural Networks (ICANN 2006) / Engineering of Intelligent Systems (ICEIS 2006)",
issn = "0925-2312",
doi = "https://doi.org/10.1016/j.neucom.2007.08.026",
url = "http://www.sciencedirect.com/science/article/pii/S0925231207003086",
author = "Pramod L. Narasimha and Walter H. Delashmit and Michael T. Manry and Jiang Li and Francisco Maldonado",
keywords = "Growing, Pruning, Cascade correlation, Back propagation, Output weight optimization–Hidden weight optimization",
abstract = "In order to facilitate complexity optimization in feedforward networks, several algorithms are developed that combine growing and pruning. First, a growing scheme is presented which iteratively adds new hidden units to full-trained networks. Then, a non-heuristic one-pass pruning technique is presented, which utilizes orthogonal least squares. Based upon pruning, a one-pass approach is developed for generating the validation error versus network size curve. A combined approach is described in which networks are continually pruned during the growing process. As a result, the hidden units are ordered according to their usefulness, and the least useful units are eliminated. Examples show that networks designed using the combined method have less training and validation error than growing or pruning alone. The combined method exhibits reduced sensitivity to the initial weights and generates an almost monotonic error versus network size curve. It is shown to perform better than two well-known growing methods—constructive backpropagation and cascade correlation."
}
@article{1997-prechelt,
title = "Connection pruning with static and adaptive pruning schedules",
journal = "Neurocomputing",
volume = "16",
number = "1",
pages = "49 - 61",
year = "1997",
issn = "0925-2312",
doi = "https://doi.org/10.1016/S0925-2312(96)00054-9",
url = "http://www.sciencedirect.com/science/article/pii/S0925231296000549",
author = "Lutz Prechelt",
keywords = "Empirical study, Pruning, Early stopping, Generalization",
abstract = "Neural network pruning methods on the level of individual network parameters (e.g. connection weights) can improve generalization, as is shown in this empirical study. However, an open problem in the pruning methods known today (e.g. OBD, OBS, autoprune, epsiprune) is the selection of the number of parameters to be removed in each pruning step (pruning strength). This work presents a pruning method lprune that automatically adapts the pruning strength to the evolution of weights and loss of generalization during training. The method requires no algorithm parameter adjustment by the user. Results of statistical significance tests comparing autoprune, lprune, and static networks with early stopping are given, based on extensive experimentation with 14 different problems. The results indicate that training with pruning is often significantly better and rarely significantly worse than training with early stopping without pruning. Furthermore, lprune is often superior to autoprune (which is superior to OBD) on diagnosis tasks unless severe pruning early in the training process is required."
}
@article{1996-cibas,
title = "Variable selection with neural networks",
journal = "Neurocomputing",
volume = "12",
number = "2",
pages = "223 - 248",
year = "1996",
note = "Current European Neurocomputing Research",
issn = "0925-2312",
doi = "https://doi.org/10.1016/0925-2312(95)00121-2",
url = "http://www.sciencedirect.com/science/article/pii/0925231295001212",
author = "Tautvydas Cibas and Françoise Fogelman Soulié and Patrick Gallinari and Sarunas Raudys",
keywords = "Variable selection, Regularization, Neural network pruning, Dimensionality reduction",
abstract = "In this paper, we present 3 different neural network-based methods to perform variable selection. OCD — Optimal Cell Damage — is a pruning method, which evaluates the usefulness of a variable and prunes the least useful ones (it is related to the Optimal Brain Damage method of Le Cun et al.). Regularization theory proposes to constrain estimators by adding a term to the cost function used to train a neural network. In the Bayesian framework, this additional term can be interpreted as the log prior to the weights distribution. We propose to use two priors (a Gaussian and a Gaussian mixture) and show that this regularization approach allows to select efficient subsets of variables. Our methods are compared to conventional statistical selection procedures and are shown to significantly improve on that."
}
@book{grunwald2007minimum,
title={The minimum description length principle},
author={Gr{\"u}nwald, Peter D},
year={2007},
publisher={MIT Press}
}
@INPROCEEDINGS{713928,
author={P. {Burrascano}},
booktitle={Proceedings of 1993 International Conference on Neural Networks (IJCNN-93-Nagoya, Japan)},
title={A pruning technique maximizing generalization},
year={1993},
volume={1},
number={},
pages={347-350 vol.1},
doi={10.1109/IJCNN.1993.713928}}
@inproceedings{NIPS1995_3473decc,
author = {Pedersen, Morten and Hansen, Lars and Larsen, Jan},
booktitle = {Advances in Neural Information Processing Systems},
editor = {D. Touretzky and M. C. Mozer and M. Hasselmo},
pages = {521--527},
publisher = {MIT Press},
title = {Pruning with generalization based weight saliencies: $\lambda$OBD, $\lambda$OBS},
url = {https://proceedings.neurips.cc/paper/1995/file/3473decccb0509fb264818a7512a8b9b-Paper.pdf},
volume = {8},
year = {1996}
}
@INPROCEEDINGS{1993-tamura,
author={S. {Tamura} and M. {Tateishi} and M. {Matumoto} and S. {Akita}},
booktitle={Proceedings of 1993 International Conference on Neural Networks (IJCNN-93-Nagoya, Japan)},
title={Determination of the number of redundant hidden units in a three-layered feedforward neural network},
year={1993},
volume={1},
number={},
pages={335-338 vol.1},
doi={10.1109/IJCNN.1993.713925}}
@inproceedings{10.5555/646365.691221,
author = {White, David and Ligomenides, Panos A.},
title = {GANNet: A Genetic Algorithm for Optimizing Topology and Weights in Neural Network Design},
year = {1993},
isbn = {3540567984},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
booktitle = {Proceedings of the International Workshop on Artificial Neural Networks: New Trends in Neural Computation},
pages = {322–327},
numpages = {6},
series = {IWANN '93}
}
@inproceedings{whitley:ijcnn90,
added-at = {2008-03-11T14:52:34.000+0100},
author = {Whitley, D. and Bogart, C.},
biburl = {https://www.bibsonomy.org/bibtex/2735ce53b3256af7931f2391503645044/idsia},
booktitle = {{P}roceedings of the International Joint Conference on Neural Networks {\rm ({W}ashington, {DC})}},
citeulike-article-id = {2379763},
interhash = {f825f0c086e007e378048c52ec98afa7},
intrahash = {735ce53b3256af7931f2391503645044},
keywords = {nn},
pages = {134--137},
priority = {2},
publisher = {IEEE Press},
timestamp = {2008-03-11T14:57:16.000+0100},
title = {The Evolution of Connectivity: {P}runing Neural Networks Using Genetic Algorithms},
year = 1990
}
@inproceedings{NIPS1988_1c9ac015,
author = {Hanson, Stephen and Pratt, Lorien},
booktitle = {Advances in Neural Information Processing Systems},
editor = {D. Touretzky},
pages = {177--185},
publisher = {Morgan-Kaufmann},
title = {Comparing Biases for Minimal Network Construction with Back-Propagation},
url = {https://proceedings.neurips.cc/paper/1988/file/1c9ac0159c94d8d0cbedc973445af2da-Paper.pdf},
volume = {1},
year = {1989}
}