From 440ad9b20f632206e39442e0803358771786b8e7 Mon Sep 17 00:00:00 2001 From: danjust Date: Mon, 13 May 2019 10:42:48 +0100 Subject: [PATCH] inital commit --- ML_lit.bib | 429 ++++++++++++++++++++++++++++++++++++++++++++++++ ML_lit.bib.bak | 430 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 859 insertions(+) create mode 100644 ML_lit.bib create mode 100644 ML_lit.bib.bak diff --git a/ML_lit.bib b/ML_lit.bib new file mode 100644 index 0000000..92207dd --- /dev/null +++ b/ML_lit.bib @@ -0,0 +1,429 @@ +% Encoding: UTF-8 + +@Article{Ben-Nun2018a, + author = {Ben-Nun, Tal and Hoefler, Torsten}, + title = {Demystifying Parallel and Distributed Deep Learning: An In-Depth Concurrency Analysis}, + journal = {arXiv preprint arXiv:1802.09941}, + year = {2018}, + groups = {Large scale ML, Reviews}, + url = {https://arxiv.org/pdf/1802.09941.pdf}, +} + +@InProceedings{Krizhevsky2012, + author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + title = {Imagenet classification with deep convolutional neural networks}, + booktitle = {Advances in neural information processing systems}, + year = {2012}, + pages = {1097--1105}, + groups = {Classic literature, Computer Vision}, + url = {https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf}, +} + +@Article{LeCun1989, + author = {LeCun, Yann and Boser, Bernhard and Denker, John S and Henderson, Donnie and Howard, Richard E and Hubbard, Wayne and Jackel, Lawrence D}, + title = {Backpropagation applied to handwritten zip code recognition}, + journal = {Neural computation}, + year = {1989}, + volume = {1}, + number = {4}, + pages = {541--551}, + groups = {Classic literature}, + publisher = {MIT Press}, +} + +@Article{Coleman2017, + author = {Coleman, Cody and Narayanan, Deepak and Kang, Daniel and Zhao, Tian and Zhang, Jian and Nardi, Luigi and Bailis, Peter and Olukotun, Kunle and R{\'e}, Chris and Zaharia, Matei}, + title = {DAWNBench: An End-to-End Deep Learning Benchmark and Competition}, + journal = {Training}, + year = {2017}, + volume = {100}, + number = {101}, + pages = {102}, + groups = {Benchmarking}, + url = {http://dawn.cs.stanford.edu/benchmark/papers/nips17-dawnbench.pdf}, +} + +@InProceedings{Adolf2016, + author = {Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-Yeon and Brooks, David}, + title = {Fathom: Reference workloads for modern deep learning methods}, + booktitle = {Workload Characterization (IISWC), 2016 IEEE International Symposium on}, + year = {2016}, + pages = {1--10}, + organization = {IEEE}, + groups = {Benchmarking}, + url = {https://arxiv.org/pdf/1608.06581.pdf}, +} + +@Article{Shi2017, + author = {Shi, Shaohuai and Chu, Xiaowen}, + title = {Performance Modeling and Evaluation of Distributed Deep Learning Frameworks on GPUs}, + journal = {arXiv preprint arXiv:1711.05979}, + year = {2017}, + groups = {Benchmarking}, + url = {https://arxiv.org/pdf/1711.05979.pdf}, +} + +@Article{Bahrampour2015, + author = {Bahrampour, Soheil and Ramakrishnan, Naveen and Schott, Lukas and Shah, Mohak}, + title = {Comparative study of deep learning software frameworks}, + journal = {arXiv preprint arXiv:1511.06435}, + year = {2015}, + groups = {Benchmarking}, + url = {https://arxiv.org/pdf/1511.06435.pdf}, +} + +@InProceedings{Shams2017, + author = {Shams, Shayan and Platania, Richard and Lee, Kisung and Park, Seung-Jong}, + title = {Evaluation of deep learning frameworks over different HPC architectures}, + booktitle = {Distributed Computing Systems 
(ICDCS), 2017 IEEE 37th International Conference on}, + year = {2017}, + pages = {1389--1396}, + organization = {IEEE}, + groups = {Benchmarking}, + url = {https://ieeexplore.ieee.org/document/7980078/?reload=true}, +} + +@Article{Qi2016, + author = {Qi, Hang and Sparks, Evan R and Talwalkar, Ameet}, + title = {Paleo: A performance model for deep neural networks}, + year = {2016}, + groups = {Benchmarking}, + url = {https://openreview.net/pdf?id=SyVVJ85lg}, +} + +@InProceedings{Adolf2016a, + author = {Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-Yeon and Brooks, David}, + title = {Fathom: Reference workloads for modern deep learning methods}, + booktitle = {Workload Characterization (IISWC), 2016 IEEE International Symposium on}, + year = {2016}, + pages = {1--10}, + organization = {IEEE}, + groups = {Benchmarking}, + url = {https://arxiv.org/pdf/1608.06581.pdf}, +} + +@Article{Ioffe2015, + author = {Sergey Ioffe and Christian Szegedy}, + title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, + abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. 
Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.9% top-5 validation error (and 4.8% test error), exceeding the accuracy of human raters.}, + date = {2015-02-11}, + eprint = {1502.03167v3}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1502.03167v3:URL}, + groups = {Optimization, Classic literature}, + keywords = {cs.LG}, + url = {http://arxiv.org/pdf/1502.03167v3}, +} + +@Article{Real2018, + author = {Real, Esteban and Aggarwal, Alok and Huang, Yanping and Le, Quoc V}, + title = {Regularized Evolution for Image Classifier Architecture Search}, + journal = {arXiv preprint arXiv:1802.01548}, + year = {2018}, + groups = {Computer Vision}, + keywords = {Neural Architectue Search, NAS, evolution}, + url = {https://arxiv.org/pdf/1802.01548.pdf}, +} + +@Article{Simonyan2014, + author = {Simonyan, Karen and Zisserman, Andrew}, + title = {Very deep convolutional networks for large-scale image recognition}, + journal = {arXiv preprint arXiv:1409.1556}, + year = {2014}, + groups = {Computer Vision}, + url = {https://arxiv.org/pdf/1409.1556.pdf}, +} + +@Article{Krizhevsky2014, + author = {Krizhevsky, Alex}, + title = {One weird trick for parallelizing convolutional neural networks}, + journal = {arXiv preprint arXiv:1404.5997}, + year = {2014}, + groups = {Computer Vision}, + url = {https://arxiv.org/pdf/1404.5997.pdf}, +} + +@Article{Goyal2017, + author = {Goyal, Priya and Doll{\'a}r, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming}, + title = {Accurate, large minibatch SGD: training imagenet in 1 hour}, + journal = {arXiv preprint arXiv:1706.02677}, + year = {2017}, + groups = {Optimization, Large scale ML, Computer Vision}, + url = {https://arxiv.org/pdf/1706.02677.pdf}, +} + +@InProceedings{Dean2012, + author = {Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen, Kai and Devin, Matthieu and Mao, Mark and Senior, Andrew and Tucker, Paul and Yang, Ke and Le, Quoc V and others}, + title = {Large scale distributed deep networks}, + booktitle = {Advances in neural information processing systems}, + year = {2012}, + pages = {1223--1231}, + groups = {Large scale ML}, + url = {https://static.googleusercontent.com/media/research.google.com/en//archive/large_deep_networks_nips2012.pdf}, +} + +@Article{Cueva2018, + author = {Cueva, Christopher J and Wei, Xue-Xin}, + title = {Emergence of grid-like representations by training recurrent neural networks to perform spatial localization}, + journal = {arXiv preprint arXiv:1803.07770}, + year = {2018}, + url = {https://arxiv.org/pdf/1803.07770.pdf}, +} + +@Article{Kingma2014, + author = {Kingma, Diederik P and Ba, Jimmy}, + title = {Adam: A method for stochastic optimization}, + journal = {arXiv preprint arXiv:1412.6980}, + year = {2014}, + groups = {Optimization}, + url = {https://arxiv.org/pdf/1412.6980.pdf}, +} + +@InProceedings{LeCun1990, + author = {LeCun, Yann and Denker, John S and Solla, Sara A}, + title = {Optimal brain damage}, + booktitle = {Advances in neural information processing systems}, + year = {1990}, + pages = {598--605}, + groups = {Classic literature, Optimization}, + url = {http://yann.lecun.com/exdb/publis/pdf/lecun-90b.pdf}, +} + +@Article{Molchanov2016, + author = {Molchanov, Pavlo and Tyree, Stephen and Karras, Tero and Aila, Timo and Kautz, Jan}, + title = {Pruning convolutional neural networks for resource efficient 
inference}, + year = {2016}, + groups = {Optimization}, + url = {https://arxiv.org/pdf/1611.06440.pdf}, +} + +@Article{Li2016, + author = {Hao Li and Asim Kadav and Igor Durdanovic and Hanan Samet and Hans Peter Graf}, + title = {Pruning Filters for Efficient ConvNets}, + abstract = {The success of CNNs in various applications is accompanied by a significant increase in the computation and parameter storage costs. Recent efforts toward reducing these overheads involve pruning and compressing the weights of various layers without hurting original accuracy. However, magnitude-based pruning of weights reduces a significant number of parameters from the fully connected layers and may not adequately reduce the computation costs in the convolutional layers due to irregular sparsity in the pruned networks. We present an acceleration method for CNNs, where we prune filters from CNNs that are identified as having a small effect on the output accuracy. By removing whole filters in the network together with their connecting feature maps, the computation costs are reduced significantly. In contrast to pruning weights, this approach does not result in sparse connectivity patterns. Hence, it does not need the support of sparse convolution libraries and can work with existing efficient BLAS libraries for dense matrix multiplications. We show that even simple filter pruning techniques can reduce inference costs for VGG-16 by up to 34% and ResNet-110 by up to 38% on CIFAR10 while regaining close to the original accuracy by retraining the networks.}, + date = {2016-08-31}, + eprint = {1608.08710v3}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1608.08710v3:PDF}, + groups = {Optimization}, + keywords = {cs.CV, cs.LG}, + url = {https://arxiv.org/pdf/1608.08710.pdf}, +} + +@Article{Tibshirani1996, + author = {Tibshirani, Robert}, + title = {Regression shrinkage and selection via the lasso}, + journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + year = {1996}, + pages = {267--288}, + groups = {Optimization, Classic literature}, + publisher = {JSTOR}, +} + +@Book{Goodfellow2016, + title = {Deep learning}, + publisher = {MIT press Cambridge}, + year = {2016}, + author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron and Bengio, Yoshua}, + volume = {1}, + groups = {Reviews}, +} + +@Article{Louizos2017, + author = {Louizos, Christos and Welling, Max and Kingma, Diederik P}, + title = {Learning Sparse Neural Networks through $ L\_0 $ Regularization}, + journal = {arXiv preprint arXiv:1712.01312}, + year = {2017}, + groups = {Optimization}, +} + +@Article{Han2015, + author = {Song Han and Huizi Mao and William J. Dally}, + title = {Deep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and Huffman Coding}, + abstract = {Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources. To address this limitation, we introduce "deep compression", a three stage pipeline: pruning, trained quantization and Huffman coding, that work together to reduce the storage requirement of neural networks by 35x to 49x without affecting their accuracy. Our method first prunes the network by learning only the important connections. Next, we quantize the weights to enforce weight sharing, finally, we apply Huffman coding. 
After the first two steps we retrain the network to fine tune the remaining connections and the quantized centroids. Pruning, reduces the number of connections by 9x to 13x; Quantization then reduces the number of bits that represent each connection from 32 to 5. On the ImageNet dataset, our method reduced the storage required by AlexNet by 35x, from 240MB to 6.9MB, without loss of accuracy. Our method reduced the size of VGG-16 by 49x from 552MB to 11.3MB, again with no loss of accuracy. This allows fitting the model into on-chip SRAM cache rather than off-chip DRAM memory. Our compression method also facilitates the use of complex neural networks in mobile applications where application size and download bandwidth are constrained. Benchmarked on CPU, GPU and mobile GPU, compressed network has 3x to 4x layerwise speedup and 3x to 7x better energy efficiency.}, + date = {2015-10-01}, + eprint = {1510.00149v5}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1510.00149v5:PDF}, + groups = {Optimization}, + keywords = {cs.CV, cs.NE}, + url = {https://arxiv.org/pdf/1510.00149.pdf}, +} + +@Article{Szegedy2016, + author = {Christian Szegedy and Sergey Ioffe and Vincent Vanhoucke and Alex Alemi}, + title = {Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning}, + abstract = {Very deep convolutional networks have been central to the largest advances in image recognition performance in recent years. One example is the Inception architecture that has been shown to achieve very good performance at relatively low computational cost. Recently, the introduction of residual connections in conjunction with a more traditional architecture has yielded state-of-the-art performance in the 2015 ILSVRC challenge; its performance was similar to the latest generation Inception-v3 network. This raises the question of whether there are any benefit in combining the Inception architecture with residual connections. Here we give clear empirical evidence that training with residual connections accelerates the training of Inception networks significantly. There is also some evidence of residual Inception networks outperforming similarly expensive Inception networks without residual connections by a thin margin. We also present several new streamlined architectures for both residual and non-residual Inception networks. These variations improve the single-frame recognition performance on the ILSVRC 2012 classification task significantly. We further demonstrate how proper activation scaling stabilizes the training of very wide residual Inception networks. With an ensemble of three residual and one Inception-v4, we achieve 3.08 percent top-5 error on the test set of the ImageNet classification (CLS) challenge}, + date = {2016-02-23}, + eprint = {1602.07261v2}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1602.07261v2:PDF}, + groups = {Computer Vision}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1602.07261v2.pdf}, +} + +@Article{Szegedy2015, + author = {Christian Szegedy and Vincent Vanhoucke and Sergey Ioffe and Jonathon Shlens and Zbigniew Wojna}, + title = {Rethinking the Inception Architecture for Computer Vision}, + abstract = {Convolutional networks are at the core of most state-of-the-art computer vision solutions for a wide variety of tasks. Since 2014 very deep convolutional networks started to become mainstream, yielding substantial gains in various benchmarks. 
Although increased model size and computational cost tend to translate to immediate quality gains for most tasks (as long as enough labeled data is provided for training), computational efficiency and low parameter count are still enabling factors for various use cases such as mobile vision and big-data scenarios. Here we explore ways to scale up networks in ways that aim at utilizing the added computation as efficiently as possible by suitably factorized convolutions and aggressive regularization. We benchmark our methods on the ILSVRC 2012 classification challenge validation set demonstrate substantial gains over the state of the art: 21.2% top-1 and 5.6% top-5 error for single frame evaluation using a network with a computational cost of 5 billion multiply-adds per inference and with using less than 25 million parameters. With an ensemble of 4 models and multi-crop evaluation, we report 3.5% top-5 error on the validation set (3.6% error on the test set) and 17.3% top-1 error on the validation set.}, + date = {2015-12-02}, + eprint = {1512.00567v3}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1512.00567v3:PDF}, + groups = {Computer Vision}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1512.00567.pdf}, +} + +@Article{He2015, + author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun}, + title = {Deep Residual Learning for Image Recognition}, + abstract = {Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.}, + date = {2015-12-10}, + eprint = {1512.03385v1}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1512.03385v1:PDF}, + groups = {Computer Vision}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1512.03385.pdf}, +} + +@Article{Han2015a, + author = {Song Han and Jeff Pool and John Tran and William J. Dally}, + title = {Learning both Weights and Connections for Efficient Neural Networks}, + abstract = {Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems. Also, conventional networks fix the architecture before training starts; as a result, training cannot improve the architecture. 
To address these limitations, we describe a method to reduce the storage and computation required by neural networks by an order of magnitude without affecting their accuracy by learning only the important connections. Our method prunes redundant connections using a three-step method. First, we train the network to learn which connections are important. Next, we prune the unimportant connections. Finally, we retrain the network to fine tune the weights of the remaining connections. On the ImageNet dataset, our method reduced the number of parameters of AlexNet by a factor of 9x, from 61 million to 6.7 million, without incurring accuracy loss. Similar experiments with VGG-16 found that the number of parameters can be reduced by 13x, from 138 million to 10.3 million, again with no loss of accuracy.}, + date = {2015-06-08}, + eprint = {1506.02626v3}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1506.02626v3:PDF}, + groups = {Optimization}, + keywords = {cs.NE, cs.CV, cs.LG}, + url = {https://arxiv.org/pdf/1506.02626.pdf}, +} + +@Article{Howard2017, + author = {Andrew G. Howard and Menglong Zhu and Bo Chen and Dmitry Kalenichenko and Weijun Wang and Tobias Weyand and Marco Andreetto and Hartwig Adam}, + title = {MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications}, + abstract = {We present a class of efficient models called MobileNets for mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depth-wise separable convolutions to build light weight deep neural networks. We introduce two simple global hyper-parameters that efficiently trade off between latency and accuracy. These hyper-parameters allow the model builder to choose the right sized model for their application based on the constraints of the problem. We present extensive experiments on resource and accuracy tradeoffs and show strong performance compared to other popular models on ImageNet classification. We then demonstrate the effectiveness of MobileNets across a wide range of applications and use cases including object detection, finegrain classification, face attributes and large scale geo-localization.}, + date = {2017-04-17}, + eprint = {1704.04861v1}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1704.04861v1:PDF}, + groups = {Optimization, Computer Vision}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1704.04861.pdf}, +} + +@Article{Courbariaux2016, + author = {Matthieu Courbariaux and Itay Hubara and Daniel Soudry and Ran El-Yaniv and Yoshua Bengio}, + title = {Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1}, + abstract = {We introduce a method to train Binarized Neural Networks (BNNs) - neural networks with binary weights and activations at run-time. At training-time the binary weights and activations are used for computing the parameters gradients. During the forward pass, BNNs drastically reduce memory size and accesses, and replace most arithmetic operations with bit-wise operations, which is expected to substantially improve power-efficiency. To validate the effectiveness of BNNs we conduct two sets of experiments on the Torch7 and Theano frameworks. On both, BNNs achieved nearly state-of-the-art results over the MNIST, CIFAR-10 and SVHN datasets. 
Last but not least, we wrote a binary matrix multiplication GPU kernel with which it is possible to run our MNIST BNN 7 times faster than with an unoptimized GPU kernel, without suffering any loss in classification accuracy. The code for training and running our BNNs is available on-line.}, + date = {2016-02-09}, + eprint = {1602.02830v3}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1602.02830v3:PDF}, + groups = {Optimization}, + keywords = {cs.LG}, +} + +@Article{Bengio2013a, + author = {Yoshua Bengio}, + title = {Estimating or Propagating Gradients Through Stochastic Neurons}, + abstract = {Stochastic neurons can be useful for a number of reasons in deep learning models, but in many cases they pose a challenging problem: how to estimate the gradient of a loss function with respect to the input of such stochastic neurons, i.e., can we "back-propagate" through these stochastic neurons? We examine this question, existing approaches, and present two novel families of solutions, applicable in different settings. In particular, it is demonstrated that a simple biologically plausible formula gives rise to an an unbiased (but noisy) estimator of the gradient with respect to a binary stochastic neuron firing probability. Unlike other estimators which view the noise as a small perturbation in order to estimate gradients by finite differences, this estimator is unbiased even without assuming that the stochastic perturbation is small. This estimator is also interesting because it can be applied in very general settings which do not allow gradient back-propagation, including the estimation of the gradient with respect to future rewards, as required in reinforcement learning setups. We also propose an approach to approximating this unbiased but high-variance estimator by learning to predict it using a biased estimator. The second approach we propose assumes that an estimator of the gradient can be back-propagated and it provides an unbiased estimator of the gradient, but can only work with non-linearities unlike the hard threshold, but like the rectifier, that are not flat for all of their range. This is similar to traditional sigmoidal units but has the advantage that for many inputs, a hard decision (e.g., a 0 output) can be produced, which would be convenient for conditional computation and achieving sparse representations and sparse gradients.}, + date = {2013-05-14}, + eprint = {1305.2982v1}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1305.2982v1:PDF}, + groups = {Optimization}, + keywords = {cs.LG}, +} + +@InProceedings{Real2017, + author = {Real, Esteban and Moore, Sherry and Selle, Andrew and Saxena, Saurabh and Suematsu, Yutaka Leon and Tan, Jie and Le, Quoc V and Kurakin, Alexey}, + title = {Large-scale evolution of image classifiers}, + booktitle = {Proceedings of the 34th International Conference on Machine Learning-Volume 70}, + year = {2017}, + pages = {2902--2911}, + organization = {JMLR. 
org}, + groups = {Computer Vision}, + keywords = {Neural Architecture Search, NAS, evolution}, + url = {https://arxiv.org/pdf/1703.01041.pdf}, +} + +@Article{Elsken2019, + author = {Elsken, Thomas and Metzen, Jan Hendrik and Hutter, Frank}, + title = {Neural Architecture Search: A Survey}, + journal = {Journal of Machine Learning Research}, + year = {2019}, + volume = {20}, + number = {55}, + pages = {1--21}, + groups = {Reviews}, + keywords = {Neural Architecture Search, NAS, Review}, + url = {https://arxiv.org/pdf/1808.05377.pdf}, +} + +@Article{Zela2018, + author = {Zela, Arber and Klein, Aaron and Falkner, Stefan and Hutter, Frank}, + title = {Towards automated deep learning: Efficient joint neural architecture and hyperparameter search}, + journal = {arXiv preprint arXiv:1807.06906}, + year = {2018}, + keywords = {Neural Architecture Search, NAS}, + url = {https://arxiv.org/pdf/1807.06906.pdf}, +} + +@Article{Pham2018, + author = {Pham, Hieu and Guan, Melody Y and Zoph, Barret and Le, Quoc V and Dean, Jeff}, + title = {Efficient neural architecture search via parameter sharing}, + journal = {arXiv preprint arXiv:1802.03268}, + year = {2018}, + keywords = {NAS, Neural Architecture Search, weight sharing, reinforcement learning}, + url = {https://arxiv.org/pdf/1802.03268.pdf}, +} + +@Article{Zoph2016, + author = {Zoph, Barret and Le, Quoc V}, + title = {Neural architecture search with reinforcement learning}, + journal = {arXiv preprint arXiv:1611.01578}, + year = {2016}, + keywords = {Neural Architecture Search, NAS, Reinforcement learning}, + url = {https://arxiv.org/pdf/1611.01578.pdf}, +} + +@Article{Snoek2012, + author = {Jasper Snoek and Hugo Larochelle and Ryan P. Adams}, + title = {Practical Bayesian Optimization of Machine Learning Algorithms}, + __markedentry = {[djustus:]}, + abstract = {Machine learning algorithms frequently require careful tuning of model hyperparameters, regularization terms, and optimization parameters. Unfortunately, this tuning is often a "black art" that requires expert experience, unwritten rules of thumb, or sometimes brute-force search. Much more appealing is the idea of developing automatic approaches which can optimize the performance of a given learning algorithm to the task at hand. In this work, we consider the automatic tuning problem within the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). The tractable posterior distribution induced by the GP leads to efficient use of the information gathered by previous experiments, enabling optimal choices about what parameters to try next. Here we show how the effects of the Gaussian process prior and the associated inference procedure can have a large impact on the success or failure of Bayesian optimization. We show that thoughtful choices can lead to results that exceed expert-level performance in tuning machine learning algorithms. We also describe new algorithms that take into account the variable cost (duration) of learning experiments and that can leverage the presence of multiple cores for parallel experimentation. 
We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization on a diverse set of contemporary algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks.}, + date = {2012-06-13}, + eprint = {1206.2944v2}, + eprintclass = {stat.ML}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1206.2944v2:PDF}, + keywords = {stat.ML, cs.LG}, + url = {https://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms.pdf}, +} + +@Comment{jabref-meta: databaseType:bibtex;} + +@Comment{jabref-meta: grouping: +0 AllEntriesGroup:; +1 StaticGroup:Large scale ML\;0\;1\;\;\;\;; +1 StaticGroup:Computer Vision\;0\;1\;\;\;\;; +1 StaticGroup:NLP\;0\;1\;\;\;\;; +1 StaticGroup:Classic literature\;0\;1\;\;\;\;; +1 StaticGroup:Reviews\;0\;1\;\;\;\;; +1 StaticGroup:Benchmarking\;0\;1\;\;\;\;; +1 StaticGroup:Optimization\;0\;1\;\;\;\;; +} diff --git a/ML_lit.bib.bak b/ML_lit.bib.bak new file mode 100644 index 0000000..7f6afb5 --- /dev/null +++ b/ML_lit.bib.bak @@ -0,0 +1,430 @@ +% Encoding: UTF-8 + +@Article{Ben-Nun2018a, + author = {Ben-Nun, Tal and Hoefler, Torsten}, + title = {Demystifying Parallel and Distributed Deep Learning: An In-Depth Concurrency Analysis}, + journal = {arXiv preprint arXiv:1802.09941}, + year = {2018}, + groups = {Large scale ML, Reviews}, + url = {https://arxiv.org/pdf/1802.09941.pdf}, +} + +@InProceedings{Krizhevsky2012, + author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + title = {Imagenet classification with deep convolutional neural networks}, + booktitle = {Advances in neural information processing systems}, + year = {2012}, + pages = {1097--1105}, + groups = {CNNs, Classic literature}, + url = {https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf}, +} + +@Article{LeCun1989, + author = {LeCun, Yann and Boser, Bernhard and Denker, John S and Henderson, Donnie and Howard, Richard E and Hubbard, Wayne and Jackel, Lawrence D}, + title = {Backpropagation applied to handwritten zip code recognition}, + journal = {Neural computation}, + year = {1989}, + volume = {1}, + number = {4}, + pages = {541--551}, + groups = {Classic literature}, + publisher = {MIT Press}, +} + +@Article{Coleman2017, + author = {Coleman, Cody and Narayanan, Deepak and Kang, Daniel and Zhao, Tian and Zhang, Jian and Nardi, Luigi and Bailis, Peter and Olukotun, Kunle and R{\'e}, Chris and Zaharia, Matei}, + title = {DAWNBench: An End-to-End Deep Learning Benchmark and Competition}, + journal = {Training}, + year = {2017}, + volume = {100}, + number = {101}, + pages = {102}, + groups = {Benchmarking}, + url = {http://dawn.cs.stanford.edu/benchmark/papers/nips17-dawnbench.pdf}, +} + +@InProceedings{Adolf2016, + author = {Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-Yeon and Brooks, David}, + title = {Fathom: Reference workloads for modern deep learning methods}, + booktitle = {Workload Characterization (IISWC), 2016 IEEE International Symposium on}, + year = {2016}, + pages = {1--10}, + organization = {IEEE}, + groups = {Benchmarking}, + url = {https://arxiv.org/pdf/1608.06581.pdf}, +} + +@Article{Shi2017, + author = {Shi, Shaohuai and Chu, Xiaowen}, + title = {Performance Modeling and Evaluation of Distributed Deep Learning Frameworks on GPUs}, + journal = {arXiv preprint arXiv:1711.05979}, + year = {2017}, + groups = {Benchmarking}, + url = 
{https://arxiv.org/pdf/1711.05979.pdf}, +} + +@Article{Bahrampour2015, + author = {Bahrampour, Soheil and Ramakrishnan, Naveen and Schott, Lukas and Shah, Mohak}, + title = {Comparative study of deep learning software frameworks}, + journal = {arXiv preprint arXiv:1511.06435}, + year = {2015}, + groups = {Benchmarking}, + url = {https://arxiv.org/pdf/1511.06435.pdf}, +} + +@InProceedings{Shams2017, + author = {Shams, Shayan and Platania, Richard and Lee, Kisung and Park, Seung-Jong}, + title = {Evaluation of deep learning frameworks over different HPC architectures}, + booktitle = {Distributed Computing Systems (ICDCS), 2017 IEEE 37th International Conference on}, + year = {2017}, + pages = {1389--1396}, + organization = {IEEE}, + groups = {Benchmarking}, + url = {https://ieeexplore.ieee.org/document/7980078/?reload=true}, +} + +@Article{Qi2016, + author = {Qi, Hang and Sparks, Evan R and Talwalkar, Ameet}, + title = {Paleo: A performance model for deep neural networks}, + year = {2016}, + groups = {Benchmarking}, + url = {https://openreview.net/pdf?id=SyVVJ85lg}, +} + +@InProceedings{Adolf2016a, + author = {Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-Yeon and Brooks, David}, + title = {Fathom: Reference workloads for modern deep learning methods}, + booktitle = {Workload Characterization (IISWC), 2016 IEEE International Symposium on}, + year = {2016}, + pages = {1--10}, + organization = {IEEE}, + groups = {Benchmarking}, + url = {https://arxiv.org/pdf/1608.06581.pdf}, +} + +@Article{Ioffe2015, + author = {Sergey Ioffe and Christian Szegedy}, + title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, + abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. 
Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.9% top-5 validation error (and 4.8% test error), exceeding the accuracy of human raters.}, + date = {2015-02-11}, + eprint = {1502.03167v3}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1502.03167v3:URL}, + groups = {Optimization, Classic literature}, + keywords = {cs.LG}, + url = {http://arxiv.org/pdf/1502.03167v3}, +} + +@Article{Real2018, + author = {Real, Esteban and Aggarwal, Alok and Huang, Yanping and Le, Quoc V}, + title = {Regularized Evolution for Image Classifier Architecture Search}, + journal = {arXiv preprint arXiv:1802.01548}, + year = {2018}, + groups = {CNNs}, + keywords = {Neural Architectue Search, NAS, evolution}, + url = {https://arxiv.org/pdf/1802.01548.pdf}, +} + +@Article{Simonyan2014, + author = {Simonyan, Karen and Zisserman, Andrew}, + title = {Very deep convolutional networks for large-scale image recognition}, + journal = {arXiv preprint arXiv:1409.1556}, + year = {2014}, + groups = {CNNs}, + url = {https://arxiv.org/pdf/1409.1556.pdf}, +} + +@Article{Krizhevsky2014, + author = {Krizhevsky, Alex}, + title = {One weird trick for parallelizing convolutional neural networks}, + journal = {arXiv preprint arXiv:1404.5997}, + year = {2014}, + groups = {CNNs}, + url = {https://arxiv.org/pdf/1404.5997.pdf}, +} + +@Article{Goyal2017, + author = {Goyal, Priya and Doll{\'a}r, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming}, + title = {Accurate, large minibatch SGD: training imagenet in 1 hour}, + journal = {arXiv preprint arXiv:1706.02677}, + year = {2017}, + groups = {CNNs, Optimization, Large scale ML}, + url = {https://arxiv.org/pdf/1706.02677.pdf}, +} + +@InProceedings{Dean2012, + author = {Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen, Kai and Devin, Matthieu and Mao, Mark and Senior, Andrew and Tucker, Paul and Yang, Ke and Le, Quoc V and others}, + title = {Large scale distributed deep networks}, + booktitle = {Advances in neural information processing systems}, + year = {2012}, + pages = {1223--1231}, + groups = {Large scale ML}, + url = {https://static.googleusercontent.com/media/research.google.com/en//archive/large_deep_networks_nips2012.pdf}, +} + +@Article{Cueva2018, + author = {Cueva, Christopher J and Wei, Xue-Xin}, + title = {Emergence of grid-like representations by training recurrent neural networks to perform spatial localization}, + journal = {arXiv preprint arXiv:1803.07770}, + year = {2018}, + url = {https://arxiv.org/pdf/1803.07770.pdf}, +} + +@Article{Kingma2014, + author = {Kingma, Diederik P and Ba, Jimmy}, + title = {Adam: A method for stochastic optimization}, + journal = {arXiv preprint arXiv:1412.6980}, + year = {2014}, + groups = {Optimization}, + url = {https://arxiv.org/pdf/1412.6980.pdf}, +} + +@InProceedings{LeCun1990, + author = {LeCun, Yann and Denker, John S and Solla, Sara A}, + title = {Optimal brain damage}, + booktitle = {Advances in neural information processing systems}, + year = {1990}, + pages = {598--605}, + groups = {Classic literature, Optimization}, + url = {http://yann.lecun.com/exdb/publis/pdf/lecun-90b.pdf}, +} + +@Article{Molchanov2016, + author = {Molchanov, Pavlo and Tyree, Stephen and Karras, Tero and Aila, Timo and Kautz, Jan}, + title = {Pruning convolutional neural networks for resource efficient inference}, + year = {2016}, + groups = 
{Optimization}, + url = {https://arxiv.org/pdf/1611.06440.pdf}, +} + +@Article{Li2016, + author = {Hao Li and Asim Kadav and Igor Durdanovic and Hanan Samet and Hans Peter Graf}, + title = {Pruning Filters for Efficient ConvNets}, + abstract = {The success of CNNs in various applications is accompanied by a significant increase in the computation and parameter storage costs. Recent efforts toward reducing these overheads involve pruning and compressing the weights of various layers without hurting original accuracy. However, magnitude-based pruning of weights reduces a significant number of parameters from the fully connected layers and may not adequately reduce the computation costs in the convolutional layers due to irregular sparsity in the pruned networks. We present an acceleration method for CNNs, where we prune filters from CNNs that are identified as having a small effect on the output accuracy. By removing whole filters in the network together with their connecting feature maps, the computation costs are reduced significantly. In contrast to pruning weights, this approach does not result in sparse connectivity patterns. Hence, it does not need the support of sparse convolution libraries and can work with existing efficient BLAS libraries for dense matrix multiplications. We show that even simple filter pruning techniques can reduce inference costs for VGG-16 by up to 34% and ResNet-110 by up to 38% on CIFAR10 while regaining close to the original accuracy by retraining the networks.}, + date = {2016-08-31}, + eprint = {1608.08710v3}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1608.08710v3:PDF}, + groups = {Optimization}, + keywords = {cs.CV, cs.LG}, + url = {https://arxiv.org/pdf/1608.08710.pdf}, +} + +@Article{Tibshirani1996, + author = {Tibshirani, Robert}, + title = {Regression shrinkage and selection via the lasso}, + journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + year = {1996}, + pages = {267--288}, + groups = {Optimization, Classic literature}, + publisher = {JSTOR}, +} + +@Book{Goodfellow2016, + title = {Deep learning}, + publisher = {MIT press Cambridge}, + year = {2016}, + author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron and Bengio, Yoshua}, + volume = {1}, + groups = {Reviews}, +} + +@Article{Louizos2017, + author = {Louizos, Christos and Welling, Max and Kingma, Diederik P}, + title = {Learning Sparse Neural Networks through $ L\_0 $ Regularization}, + journal = {arXiv preprint arXiv:1712.01312}, + year = {2017}, + groups = {Optimization}, +} + +@Article{Han2015, + author = {Song Han and Huizi Mao and William J. Dally}, + title = {Deep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and Huffman Coding}, + abstract = {Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources. To address this limitation, we introduce "deep compression", a three stage pipeline: pruning, trained quantization and Huffman coding, that work together to reduce the storage requirement of neural networks by 35x to 49x without affecting their accuracy. Our method first prunes the network by learning only the important connections. Next, we quantize the weights to enforce weight sharing, finally, we apply Huffman coding. After the first two steps we retrain the network to fine tune the remaining connections and the quantized centroids. 
Pruning, reduces the number of connections by 9x to 13x; Quantization then reduces the number of bits that represent each connection from 32 to 5. On the ImageNet dataset, our method reduced the storage required by AlexNet by 35x, from 240MB to 6.9MB, without loss of accuracy. Our method reduced the size of VGG-16 by 49x from 552MB to 11.3MB, again with no loss of accuracy. This allows fitting the model into on-chip SRAM cache rather than off-chip DRAM memory. Our compression method also facilitates the use of complex neural networks in mobile applications where application size and download bandwidth are constrained. Benchmarked on CPU, GPU and mobile GPU, compressed network has 3x to 4x layerwise speedup and 3x to 7x better energy efficiency.}, + date = {2015-10-01}, + eprint = {1510.00149v5}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1510.00149v5:PDF}, + groups = {Optimization}, + keywords = {cs.CV, cs.NE}, + url = {https://arxiv.org/pdf/1510.00149.pdf}, +} + +@Article{Szegedy2016, + author = {Christian Szegedy and Sergey Ioffe and Vincent Vanhoucke and Alex Alemi}, + title = {Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning}, + abstract = {Very deep convolutional networks have been central to the largest advances in image recognition performance in recent years. One example is the Inception architecture that has been shown to achieve very good performance at relatively low computational cost. Recently, the introduction of residual connections in conjunction with a more traditional architecture has yielded state-of-the-art performance in the 2015 ILSVRC challenge; its performance was similar to the latest generation Inception-v3 network. This raises the question of whether there are any benefit in combining the Inception architecture with residual connections. Here we give clear empirical evidence that training with residual connections accelerates the training of Inception networks significantly. There is also some evidence of residual Inception networks outperforming similarly expensive Inception networks without residual connections by a thin margin. We also present several new streamlined architectures for both residual and non-residual Inception networks. These variations improve the single-frame recognition performance on the ILSVRC 2012 classification task significantly. We further demonstrate how proper activation scaling stabilizes the training of very wide residual Inception networks. With an ensemble of three residual and one Inception-v4, we achieve 3.08 percent top-5 error on the test set of the ImageNet classification (CLS) challenge}, + date = {2016-02-23}, + eprint = {1602.07261v2}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1602.07261v2:PDF}, + groups = {CNNs}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1602.07261v2.pdf}, +} + +@Article{Szegedy2015, + author = {Christian Szegedy and Vincent Vanhoucke and Sergey Ioffe and Jonathon Shlens and Zbigniew Wojna}, + title = {Rethinking the Inception Architecture for Computer Vision}, + abstract = {Convolutional networks are at the core of most state-of-the-art computer vision solutions for a wide variety of tasks. Since 2014 very deep convolutional networks started to become mainstream, yielding substantial gains in various benchmarks. 
Although increased model size and computational cost tend to translate to immediate quality gains for most tasks (as long as enough labeled data is provided for training), computational efficiency and low parameter count are still enabling factors for various use cases such as mobile vision and big-data scenarios. Here we explore ways to scale up networks in ways that aim at utilizing the added computation as efficiently as possible by suitably factorized convolutions and aggressive regularization. We benchmark our methods on the ILSVRC 2012 classification challenge validation set demonstrate substantial gains over the state of the art: 21.2% top-1 and 5.6% top-5 error for single frame evaluation using a network with a computational cost of 5 billion multiply-adds per inference and with using less than 25 million parameters. With an ensemble of 4 models and multi-crop evaluation, we report 3.5% top-5 error on the validation set (3.6% error on the test set) and 17.3% top-1 error on the validation set.}, + date = {2015-12-02}, + eprint = {1512.00567v3}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1512.00567v3:PDF}, + groups = {CNNs}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1512.00567.pdf}, +} + +@Article{He2015, + author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun}, + title = {Deep Residual Learning for Image Recognition}, + abstract = {Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.}, + date = {2015-12-10}, + eprint = {1512.03385v1}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1512.03385v1:PDF}, + groups = {CNNs}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1512.03385.pdf}, +} + +@Article{Han2015a, + author = {Song Han and Jeff Pool and John Tran and William J. Dally}, + title = {Learning both Weights and Connections for Efficient Neural Networks}, + abstract = {Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems. Also, conventional networks fix the architecture before training starts; as a result, training cannot improve the architecture. 
To address these limitations, we describe a method to reduce the storage and computation required by neural networks by an order of magnitude without affecting their accuracy by learning only the important connections. Our method prunes redundant connections using a three-step method. First, we train the network to learn which connections are important. Next, we prune the unimportant connections. Finally, we retrain the network to fine tune the weights of the remaining connections. On the ImageNet dataset, our method reduced the number of parameters of AlexNet by a factor of 9x, from 61 million to 6.7 million, without incurring accuracy loss. Similar experiments with VGG-16 found that the number of parameters can be reduced by 13x, from 138 million to 10.3 million, again with no loss of accuracy.}, + date = {2015-06-08}, + eprint = {1506.02626v3}, + eprintclass = {cs.NE}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1506.02626v3:PDF}, + groups = {Optimization}, + keywords = {cs.NE, cs.CV, cs.LG}, + url = {https://arxiv.org/pdf/1506.02626.pdf}, +} + +@Article{Howard2017, + author = {Andrew G. Howard and Menglong Zhu and Bo Chen and Dmitry Kalenichenko and Weijun Wang and Tobias Weyand and Marco Andreetto and Hartwig Adam}, + title = {MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications}, + abstract = {We present a class of efficient models called MobileNets for mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depth-wise separable convolutions to build light weight deep neural networks. We introduce two simple global hyper-parameters that efficiently trade off between latency and accuracy. These hyper-parameters allow the model builder to choose the right sized model for their application based on the constraints of the problem. We present extensive experiments on resource and accuracy tradeoffs and show strong performance compared to other popular models on ImageNet classification. We then demonstrate the effectiveness of MobileNets across a wide range of applications and use cases including object detection, finegrain classification, face attributes and large scale geo-localization.}, + date = {2017-04-17}, + eprint = {1704.04861v1}, + eprintclass = {cs.CV}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1704.04861v1:PDF}, + groups = {Optimization, CNNs}, + keywords = {cs.CV}, + url = {https://arxiv.org/pdf/1704.04861.pdf}, +} + +@Article{Courbariaux2016, + author = {Matthieu Courbariaux and Itay Hubara and Daniel Soudry and Ran El-Yaniv and Yoshua Bengio}, + title = {Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1}, + abstract = {We introduce a method to train Binarized Neural Networks (BNNs) - neural networks with binary weights and activations at run-time. At training-time the binary weights and activations are used for computing the parameters gradients. During the forward pass, BNNs drastically reduce memory size and accesses, and replace most arithmetic operations with bit-wise operations, which is expected to substantially improve power-efficiency. To validate the effectiveness of BNNs we conduct two sets of experiments on the Torch7 and Theano frameworks. On both, BNNs achieved nearly state-of-the-art results over the MNIST, CIFAR-10 and SVHN datasets. 
Last but not least, we wrote a binary matrix multiplication GPU kernel with which it is possible to run our MNIST BNN 7 times faster than with an unoptimized GPU kernel, without suffering any loss in classification accuracy. The code for training and running our BNNs is available on-line.}, + date = {2016-02-09}, + eprint = {1602.02830v3}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1602.02830v3:PDF}, + groups = {Optimization}, + keywords = {cs.LG}, +} + +@Article{Bengio2013a, + author = {Yoshua Bengio}, + title = {Estimating or Propagating Gradients Through Stochastic Neurons}, + abstract = {Stochastic neurons can be useful for a number of reasons in deep learning models, but in many cases they pose a challenging problem: how to estimate the gradient of a loss function with respect to the input of such stochastic neurons, i.e., can we "back-propagate" through these stochastic neurons? We examine this question, existing approaches, and present two novel families of solutions, applicable in different settings. In particular, it is demonstrated that a simple biologically plausible formula gives rise to an an unbiased (but noisy) estimator of the gradient with respect to a binary stochastic neuron firing probability. Unlike other estimators which view the noise as a small perturbation in order to estimate gradients by finite differences, this estimator is unbiased even without assuming that the stochastic perturbation is small. This estimator is also interesting because it can be applied in very general settings which do not allow gradient back-propagation, including the estimation of the gradient with respect to future rewards, as required in reinforcement learning setups. We also propose an approach to approximating this unbiased but high-variance estimator by learning to predict it using a biased estimator. The second approach we propose assumes that an estimator of the gradient can be back-propagated and it provides an unbiased estimator of the gradient, but can only work with non-linearities unlike the hard threshold, but like the rectifier, that are not flat for all of their range. This is similar to traditional sigmoidal units but has the advantage that for many inputs, a hard decision (e.g., a 0 output) can be produced, which would be convenient for conditional computation and achieving sparse representations and sparse gradients.}, + date = {2013-05-14}, + eprint = {1305.2982v1}, + eprintclass = {cs.LG}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1305.2982v1:PDF}, + groups = {Optimization}, + keywords = {cs.LG}, +} + +@InProceedings{Real2017, + author = {Real, Esteban and Moore, Sherry and Selle, Andrew and Saxena, Saurabh and Suematsu, Yutaka Leon and Tan, Jie and Le, Quoc V and Kurakin, Alexey}, + title = {Large-scale evolution of image classifiers}, + booktitle = {Proceedings of the 34th International Conference on Machine Learning-Volume 70}, + year = {2017}, + pages = {2902--2911}, + organization = {JMLR. 
org}, + groups = {CNNs}, + keywords = {Neural Architectue Search, NAS, evolution}, + url = {https://arxiv.org/pdf/1703.01041.pdf}, +} + +@Article{Elsken2019, + author = {Elsken, Thomas and Metzen, Jan Hendrik and Hutter, Frank}, + title = {Neural Architecture Search: A Survey}, + journal = {Journal of Machine Learning Research}, + year = {2019}, + volume = {20}, + number = {55}, + pages = {1--21}, + groups = {Reviews}, + keywords = {Neural Architectue Search, NAS, Review}, + url = {https://arxiv.org/pdf/1808.05377.pdf}, +} + +@Article{Zela2018, + author = {Zela, Arber and Klein, Aaron and Falkner, Stefan and Hutter, Frank}, + title = {Towards automated deep learning: Efficient joint neural architecture and hyperparameter search}, + journal = {arXiv preprint arXiv:1807.06906}, + year = {2018}, + keywords = {Neural Architectue Search, NAS}, + url = {https://arxiv.org/pdf/1807.06906.pdf}, +} + +@Article{Pham2018, + author = {Pham, Hieu and Guan, Melody Y and Zoph, Barret and Le, Quoc V and Dean, Jeff}, + title = {Efficient neural architecture search via parameter sharing}, + journal = {arXiv preprint arXiv:1802.03268}, + year = {2018}, + keywords = {NAS, Neural Architecture Search, weight sharing, reinforcement learning}, + url = {https://arxiv.org/pdf/1802.03268.pdf}, +} + +@Article{Zoph2016, + author = {Zoph, Barret and Le, Quoc V}, + title = {Neural architecture search with reinforcement learning}, + journal = {arXiv preprint arXiv:1611.01578}, + year = {2016}, + keywords = {Neural Architecture Search, NAS, Reinforcement learning}, + url = {https://arxiv.org/pdf/1611.01578.pdf}, +} + +@Article{Snoek2012, + author = {Jasper Snoek and Hugo Larochelle and Ryan P. Adams}, + title = {Practical Bayesian Optimization of Machine Learning Algorithms}, + __markedentry = {[djustus:]}, + abstract = {Machine learning algorithms frequently require careful tuning of model hyperparameters, regularization terms, and optimization parameters. Unfortunately, this tuning is often a "black art" that requires expert experience, unwritten rules of thumb, or sometimes brute-force search. Much more appealing is the idea of developing automatic approaches which can optimize the performance of a given learning algorithm to the task at hand. In this work, we consider the automatic tuning problem within the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). The tractable posterior distribution induced by the GP leads to efficient use of the information gathered by previous experiments, enabling optimal choices about what parameters to try next. Here we show how the effects of the Gaussian process prior and the associated inference procedure can have a large impact on the success or failure of Bayesian optimization. We show that thoughtful choices can lead to results that exceed expert-level performance in tuning machine learning algorithms. We also describe new algorithms that take into account the variable cost (duration) of learning experiments and that can leverage the presence of multiple cores for parallel experimentation. 
We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization on a diverse set of contemporary algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks.}, + date = {2012-06-13}, + eprint = {1206.2944v2}, + eprintclass = {stat.ML}, + eprinttype = {arXiv}, + file = {online:http\://arxiv.org/pdf/1206.2944v2:PDF}, + keywords = {stat.ML, cs.LG}, + url = {https://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms.pdf}, +} + +@Comment{jabref-meta: databaseType:bibtex;} + +@Comment{jabref-meta: grouping: +0 AllEntriesGroup:; +1 StaticGroup:Large scale ML\;0\;1\;\;\;\;; +1 StaticGroup:CNNs\;0\;1\;\;\;\;; +1 StaticGroup:NLP\;0\;1\;\;\;\;; +1 StaticGroup:Classic literature\;0\;1\;\;\;\;; +1 StaticGroup:Reviews\;0\;1\;\;\;\;; +1 StaticGroup:Benchmarking\;0\;1\;\;\;\;; +1 StaticGroup:Optimization\;0\;1\;\;\;\;; +1 StaticGroup:Hardware\;0\;1\;\;\;\;; +}