From ebb3f8325ed2af61398bb946657ad093b5dba386 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy
Date: Sun, 23 Dec 2018 21:58:45 +0100
Subject: [PATCH] Arraymancer v0.5.0 (#342)

* Move gru primitive
* remove optimizer from doc
* Update nimdoc.cfg with gru and embedding
* Delete outdated design doc
* Update contributors
* Fill changelog
* Mention Agent Smith and Laser random sampling in the changelog
* changelog: mentioned potential GRU changes depending on CuDNN differences with CPU implementation
* Update README
* Update README
* Update README with history of the release names
* Readme and changelog fixes
* Update changelog
* Nimble package v0.5.0
* Use @ for contributors
---
 Contributors.md                               |  27 +-
 Design_document.md                            | 246 ------------------
 README.md                                     | 171 ++++++------
 arraymancer.nimble                            |  10 +-
 changelog.md                                  | 137 +++++++++-
 nimdoc.cfg                                    |  10 +
 src/nn_primitives/nn_primitives.nim           |   2 +-
 src/nn_primitives/{recurrent => }/nnp_gru.nim |   6 +-
 8 files changed, 253 insertions(+), 356 deletions(-)
 delete mode 100644 Design_document.md
 rename src/nn_primitives/{recurrent => }/nnp_gru.nim (99%)

diff --git a/Contributors.md b/Contributors.md
index 68ba3fd27..69cc5053f 100644
--- a/Contributors.md
+++ b/Contributors.md
@@ -1,6 +1,9 @@
-Arraymancer contributors (sorted alphabetically)
+Main Arraymancer contributors (sorted alphabetically)

-### Eduardo Bart
+### Andrea Ferretti (@andreaferretti)
+  - Autograd of mean along an axis
+
+### Eduardo Bart (@edubart)
   - OpenMP
   - Several performance optimizations and fixes including
     - Strided iterators
@@ -8,5 +11,21 @@
   - Shapeshifting procs
   - Developing the ecosystem with [arraymancer-vision](https://github.com/edubart/arraymancer-vision) and [arraymancer-demos](https://github.com/edubart/arraymancer-demos)

-### Mamy Ratsimbazafy
-  - Lead dev
\ No newline at end of file
+### Fabian Keller (@bluenote10)
+  - CSV and toSeq exports
+  - Tensor plotting tool
+  - Several fixes
+
+### Mamy Ratsimbazafy (@mratsim)
+  - Lead dev
+
+### Manguluka (@manguluka)
+  - tanh activation
+
+### Vindaar (@vindaar)
+  - HDF5 support
+
+### Xander Johnson (@metasyn)
+  - Kmeans clustering
+  - Automation of MNIST download, caching and reading from compressed gzip
+  - IMDB dataset loader

diff --git a/Design_document.md b/Design_document.md
deleted file mode 100644
index 8c5253a1c..000000000
--- a/Design_document.md
+++ /dev/null
@@ -1,246 +0,0 @@
# Design document

This is a notepad to track ideas, challenges, future work and open issues/limitations of Arraymancer.

OUTDATED: This information is stale; the design document will be moved to the Arraymancer "Under the hood" documentation.
- [Design document](#design-document)
    - [Storage convention](#storage-convention)
    - [Pending issues](#pending-issues)
    - [Data structure considerations](#data-structure-considerations)
    - [Memory/Perf considerations](#memoryperf-considerations)
    - [CUDA considerations](#cuda-considerations)
    - [Coding-style](#coding-style)
    - [Future features](#future-features)
        - [Software features](#software-features)
        - [Backend/hardware features](#backendhardware-features)
    - [Ideas rejected](#ideas-rejected)
        - [Having a unified Tensor type instead of Tensor, CudaTensor, etc.](#having-a-unified-tensor-type-instead-of-tensor-cudatensor-etc)
        - [Have the rank of the Tensor be part of its type.](#have-the-rank-of-the-tensor-be-part-of-its-type)
        - [Have the kind of stride (C_contiguous, F_contiguous) be part of its type.](#have-the-kind-of-stride-c_contiguous-f_contiguous-be-part-of-its-type)
        - [Implement offsets and iterators using pointers.](#implement-offsets-and-iterators-using-pointers)
        - [Shallow-copy by default:](#shallow-copy-by-default)
        - [Have polymorphic procs depending on a backend parameter](#have-polymorphic-procs-depending-on-a-backend-parameter)
    - [Readings](#readings)
        - [Performance](#performance)

## Storage convention

Either C- or Fortran-contiguous arrays are needed for BLAS optimization of tensors of rank 1 or 2:
* C_contiguous: row major - the default on CPU. The last index changes fastest (columns in 2D, depth in 3D) - rows (slowest), columns, depth (fastest).
* F_contiguous: column major - the default on CUDA. The first index changes fastest (rows in 2D, depth in 3D) - rows (fastest), columns, depth (slowest).
* Universal: any strides.

Historically, Fortran and all optimized BLAS libraries used column-major layout by default. Today Fortran, Matlab, R, Julia, OpenGL and CUDA use column-major layout. On the other hand, C, Python and several deep learning libraries (Numpy, Torch, Caffe) use row-major layout.

On CPU, Arraymancer follows the C/Python crowd. A practical bonus is that matrix-vector multiplication should be faster: we traverse each column in a row and then change row, so rows change the slowest. On CUDA, Arraymancer follows (temporarily) the column-major layout, as many CUBLAS in-place operations expect that layout. Rewrite rules will be used for the "cuda()" proc so that, if possible, CudaTensors are initialized directly on the device with column-major layout and don't need conversion. Arraymancer will use row-major on CUDA once CUBLAS operations are replaced with custom kernels.

## Pending issues
* Some slicing syntax does not work inside generic procs: https://github.com/mratsim/Arraymancer/issues/62

## Data structure considerations

* Shape and strides are stored in a 72-byte stack data structure (a 64B 8-element array + an 8B int64 length). Cache-line wise (64B on consumer CPUs) this is not ideal, but stack-allocated arrays are much better than heap-allocated, GC-managed seqs (~40% perf difference from switching away from seq); see the layout sketch just before the Coding-style section below.

* For now, shallowCopy will be used only in strategic places, for example when we want to mutate the original reference but use another striding scheme in `slicerMut`. Slicing will not return views. Contrary to Python, the compiler can do the following optimizations:
    - Copy elision
    - Move on assignment
    - Detect that the original Tensor is not used anymore, making the copy unneeded.
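As a standalone illustration of the value semantics in play (plain Nim, not Arraymancer code): assigning a seq to a `var` deep-copies, so mutations of the copy never propagate back.

```Nim
var a = @[1, 2, 3]
var b = a     # assignment to a `var` deep-copies the seq buffer
b[0] = 999    # mutating the copy...
echo a        # @[1, 2, 3] -- ...leaves the original untouched
echo b        # @[999, 2, 3]
```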
If further no-copy optimizations are needed, move optimization with {call} can be used so that the compiler automatically chooses a no-copy version when only one reference exists: https://nim-lang.org/docs/manual.html#ast-based-overloading-move-optimization
In the future Nim will support a more general `=move` operator and destructors. This will be evaluated carefully; see Araq's blog post: https://nim-lang.org/araq/destructors.html

## Memory/Perf considerations
* Current CPU cache lines are 64 bytes. We tried a 32-byte Tensor data structure with shape and strides being seqs instead of an 8-element array + actual length: heap allocation was far too slow. We will probably get further improvements if shape and strides each fit in 64 bytes.
Using uint for indexing is dangerous: tensor[0 - 1] would roll over to tensor[2^32]. Using int32 is possible, but in the future we can expect huge sparse tensors that need int64 indexing, and int (int64) is the default in Nim. This might also be a way to win over users hitting this limitation of Numpy: https://github.com/ContinuumIO/anaconda-issues/issues/3823, https://github.com/numpy/numpy/issues/5906

* Most copy operations (from nested arrays/seqs, and slice assignments from nested arrays/seqs or Tensors) use iterators and avoid intermediate representations.

* Map: Tensor[T] -> Tensor[string] or Tensor[ref AnyObject] uses a non-multithreaded slow path, because heap allocation does not work with OpenMP.

* The tensor module is already very optimized regarding memory allocations. Manual memory management is probably overkill; Nim's GC is already extremely fast.

* The autograd module must be optimized as well, since the batch temporaries are freed between batches. One way to do that is via a memory pool or a custom allocator (buddy allocator, slab allocator, slub allocator, ...). One of the challenges is that, since Arraymancer is a dynamic framework, some batches may be bigger or smaller than others, so we can't just reuse the same memory locations.

## CUDA considerations

* Reclaiming memory: currently all CudaTensors are created via new + finalizer. The finalizer proc is automatically called (at a non-deterministic time) after the object goes out of scope. If there are memory leaks, it might be because a CudaTensor wasn't created by new and so needs a `=destroy` destructor proc. Discussions on IRC highlighted that a finalizer is enough for yglukhov's game engine.

* Allocations on Cuda are much more expensive than on CPU, so a custom allocator will be needed. (Memory management is already manual anyway.)

* Default to column-major layout (Fortran order). Internally CUDA/CuBLAS work with column-major layout; creating CudaTensors column-major by default may avoid temporary transpose allocations.

* Currently CudaTensors are shallow-copied by default. From a consistency point of view it would be best if both Tensor and CudaTensor had the same behaviour. This is pending Nim improvements on assignment operator overloading and the destructors + move optimization implementation.

* Async operations: operations on the CUDA device ("Host -> GPU" copies, additions, subtractions, etc.) are non-blocking for the host, meaning the program can proceed with CPU computation. The "GPU -> Host" copy operation is blocking to avoid data races.

In the future, independent operations like A+B and C+D might be scheduled in different Cuda streams for simultaneous processing.
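To make the metadata and storage-convention points above concrete, here is a minimal sketch (illustrative names only, not Arraymancer's actual internals) of a stack-allocated shape/strides container and of C-contiguous stride computation:

```Nim
const MAXRANK = 8

type MetadataArray = object
  data: array[MAXRANK, int] # 8 * 8B = 64 bytes: exactly one cache line
  len: int                  # + 8 bytes -> the 72-byte stack structure discussed above

# C-contiguous (row-major) strides: the last dimension changes fastest.
proc rowMajorStrides(shape: openArray[int]): seq[int] =
  result = newSeq[int](shape.len)
  var accum = 1
  for i in countdown(shape.len - 1, 0):
    result[i] = accum
    accum *= shape[i]

echo rowMajorStrides([2, 3, 4]) # @[12, 4, 1]: element [i, j, k] sits at flat index 12*i + 4*j + 1*k
```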
## Coding-style
* Prefer `when` to `if` for compile-time evaluation.
* Let the compiler do its job:
    - proc everywhere, with an `inline` tag to nudge it towards the expected optimization (it might choose not to inline after a cost analysis anyway)
    - template if a proc does not work or to access an object field
    - macro as a last resort, to manipulate the AST or rewrite code
* Readability, maintainability and performance are very important (in no particular order).
* Use functional constructs like `map` and `scanr` instead of a `for` loop when you don't need side-effects or an iterator.

## Future features

### Software features

* Implement a tensor comprehension macro. It may be able to leverage `mitems` instead of `result[i,j] = alpha * (i - j) * (i + j)`.

* Implement einsum: https://obilaniu6266h16.wordpress.com/2016/02/04/einstein-summation-in-numpy/

* Automatically pull nnpack for optimized convolution on CPU (Linux/macOS only).

* Provide config files for Cuda, MKL, OpenMP, etc.

* BLAS operation fusion: `transpose(A) * B` or `Ax + y` should be fused into one operation.

* Implement a Scalar[T] concept so that regular floats are treated as tensors.

* (Needs thinking) Support sparse matrices. There are Magma and CuSparse for GPU; what to use for CPU? The interface should be similar to BLAS and should compile on ARM/embedded devices like the Jetson TX1.

* Implement Bayesian neural networks.

* Implement graph neural networks.

### Backend/hardware features

* OpenCL, probably via CLBlast.

* AMD ROCm (I don't have any AMD GPU though).

* Javascript backend. Using the Nim compiler directly is difficult, see PR https://github.com/mratsim/Arraymancer/pull/126; we can start with emscripten though.

* Metal Performance Shaders backend for iPhone compatibility (can we emulate/test this on macOS?).

## Ideas rejected

### Having a unified Tensor type instead of Tensor, CudaTensor, etc.

Rejected because of the maintenance burden and difficult-to-debug errors. For example, for this data structure:

```Nim
type
  Backend* = enum
    Cpu,
    Cuda

  Tensor*[B: static[Backend]; T] = object
    shape: seq[int]
    strides: seq[int]
    offset: int
    when B == Backend.Cpu:
      data: seq[T]
    else:
      data_ptr: ptr T

template shape*(t: Tensor): seq[int] =
  t.shape
```

the template will not compile, failing with "Cannot generate B": because of the conditional `when`, Nim wants B in all proc declarations. The error points to the type declaration and not the proc declaration, which makes it a pain to debug.

Furthermore, the comparison operator "==" fails with "Cannot generate B" and I found no solution to that.

Also, having more independent types will probably make future features easier (distributed compute, MPI?).

### Have the rank of the Tensor be part of its type.
Rejected because it is impractical for function chaining.
Advantage: dispatch and compatibility checking at compile time (Matrix * Matrix and Matrix * Vec).

### Have the kind of stride (C_contiguous, F_contiguous) be part of its type.
Rejected because it is impractical for function chaining. Furthermore, this should not be exposed to users, as it's an implementation detail.

### Implement offsets and iterators using pointers.
Indexing into a strided array is basically doing a dot product. For a 3x3 matrix, the strides are [3, 1]; in memory, the element at position [1, 2] will be at 3*1 + 1*2 --> the 5th position (i.e.
we did a dot product).

> 0 1 2

> 3 4 5

> 6 7 8

After transposition, the strides are [1, 3] and the matrix shape is:

> 0 3 6

> 1 4 7

> 2 5 8

but the corresponding order in memory is still the same as before transposition. So the pointer must jump by 3 twice, then go back by 5, then jump by 3 twice, then go back by 5 again. There is probably a mathematical formula behind this, but it's much easier and less error-prone to do a dot product, especially in high dimensions.

Since we will do a dot product anyway instead of shifting a pointer by a constant, just doing regular array/sequence indexing is better: we get automatic bounds checking and future Nim improvements, and it's much easier to copy a Tensor since there is no need to recalculate pointer addresses. We just need a way to provide a pointer to the beginning of the data to BLAS.

Perf note: an (integer?) dot product is vectorized on CPU and GPU, and the strides seq will stay in cache, so performance is probably bound by the non-contiguous memory accesses. Moving a pointer sometimes by x, sometimes by y, and sometimes backwards would also be bound by memory access (given a correct, and probably cumbersome, implementation).

### Shallow-copy by default:
Rejected until benchmarks prove otherwise.
If further no-copy optimizations are needed, move optimization with {call} can be used so that the compiler automatically chooses a no-copy version when only one reference exists: https://nim-lang.org/docs/manual.html#ast-based-overloading-move-optimization

For CudaTensor, value semantics will be implemented.

`data` is currently stored in a seq, which always deep-copies on `var` assignment. It doesn't copy on `let` assignment.

If slicing shallow-copied by default like in Numpy, there would be a risk of modifying the original array by mistake. Since Nim is compiled, we can hope that the compiler detects cases where the original tensor is not reused, and moves instead of copying. Excellent reads on value semantics vs reference semantics: https://akrzemi1.wordpress.com/2012/02/03/value-semantics/, https://juanchopanzacpp.wordpress.com/2014/05/11/want-speed-dont-always-pass-by-value/ and https://definedbehavior.blogspot.fr/2011/08/value-semantics-copy-elision.html. Nim in-depth discussion: https://forum.nim-lang.org/t/2665/1.

References:
  - [Copy semantics](https://forum.nim-lang.org/t/1793/5): "Parameter passing doesn't copy, var x = foo() doesn't copy but moves, let x = y doesn't copy but moves, var x = y does copy but I can use shallowCopy instead of = for that."
  - [Another](https://forum.nim-lang.org/t/1543): "First, it's important to understand that most of the time, you won't need shallowCopy at all. Copying is shallow by default if (1) the left-hand side of an assignment is a let variable or (2) the right-hand side is a function call."

Also, using seq is far easier than implementing my own shallowCopy / refcounting code, which would raise the following questions:
- How do we make sure we can modify in-place if shallow copy is allowed or a ref seq/object is used?
- To avoid reference counting, would it be better to always copy-on-write? In that case, wouldn't it be better to pay the cost upfront, on assignment?
- How hard will it be to maintain Arraymancer and avoid bugs caused by a missed copy-on-write?
  - From Scipy: https://docs.scipy.org/doc/numpy/user/c-info.how-to-extend.html#reference-counting
  - "If you mis-handle reference counts you can get problems from memory-leaks to segmentation faults.
The only strategy I know of to handle reference counts correctly is blood, sweat, and tears."

Nim GC perf: https://gist.github.com/dom96/77b32e36b62377b2e7cadf09575b8883

In-depth [read](http://blog.stablekernel.com/when-to-use-value-types-and-reference-types-in-swift) (for Swift, but applicable): performance, safety, usability.

### Have polymorphic procs depending on a backend parameter
With the [following commit](https://github.com/mratsim/Arraymancer/blob/260386da01c9185f551f8afbe41d2c4beeeee92c/src/arraymancer/init_common.nim) in the Cuda branch, all init procs accepted a backend parameter (Cpu, Cuda, ...). When the backend had a dedicated function like "zeros", this avoided having to create the tensor on the Cpu and then copy it to the backend.
The downsides are:
- Complicating the procs with untyped templates, auto return types, "when t is Tensor" or "when backend is Cpu". This might promote spaghetti code.
- All new backends would require modifications to the base procs, with more risk of introducing new bugs.
- In the case of the "init" functions, it requires the `check_nested_elements` proc in one file, then __Cpu__- and __Cuda__-specific code in another, then a __common__ file with the polymorphic procs. This would make it difficult to understand and contribute to the code.
- Only a few init functions can be used directly on GPU; **ones** and **randomTensor** would require creation on the Cpu backend anyway.

Two alternatives are possible to avoid that:
- Only provide the base proc for Cpu and have a rewrite rule to transform zeros(...).toCuda() into the direct Cuda function if it exists (i.e. composition).
- Use qualified imports, like `import arraymancer as arc` and `import arraymancer/cuda as cu`, and then `arc.zeros` or `cu.zeros`.

## Readings

### Performance
- Compute-bound, memory-bound and IO-bound optimization: http://leto.net/docs/C-optimization.php
- Implementing matmul from scratch: http://apfel.mathematik.uni-ulm.de/~lehn/ulmBLAS/
- Implementing matmul in Nvidia assembler from scratch: https://github.com/NervanaSystems/maxas/wiki/SGEMM
- In-depth discussion on fast convolution (NCHW vs CHNW representation, Winograd kernel): https://github.com/soumith/convnet-benchmarks/issues/93
- Roofline performance model, arithmetic intensity - CPU-bound vs memory-bound: https://crd.lbl.gov/departments/computer-science/PAR/research/roofline/

diff --git a/README.md b/README.md
index 75ed8fb2c..57f17b9de 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,15 @@ # Arraymancer - An n-dimensional tensor (ndarray) library.

-Arraymancer is a tensor (N-dimensional array) project in Nim. The main focus is providing a fast and ergonomic CPU, Cuda and OpenCL ndarray library on which to build a scientific computing and in particular a deep learning ecosystem.
+Arraymancer is a tensor (N-dimensional array) project in Nim. The main focus is providing a fast and ergonomic CPU, Cuda and OpenCL ndarray library on which to build a scientific computing ecosystem.

-The library is inspired by Numpy and PyTorch. The library provides ergonomics very similar to Numpy, Julia and Matlab but is fully parallel and significantly faster than those libraries. It is also faster than C-based Torch.
+The library is inspired by Numpy and PyTorch and targets the following use-cases:
+  - N-dimensional arrays (tensors) for numerical computing
+  - machine learning algorithms (as in Scikit-learn: least squares solvers, PCA and dimensionality reduction, classifiers, regressors and clustering algorithms, cross-validation).
+  - deep learning
+
+The ndarray component can be used without the machine learning and deep learning components.
+It can also use the OpenMP, Cuda or OpenCL backends.

 Note: While Nim is compiled and does not offer an interactive REPL yet (like Jupyter), it allows much faster prototyping than C++ due to extremely fast compilation times. Arraymancer compiles in about 5 seconds on my dual-core MacBook.

@@ -156,6 +162,31 @@ for t in 0 ..< 500:
   optim.update()
 ```

+### Teaser: A text generated with Arraymancer's recurrent neural network
+
+From [example 6](./examples/ex06_shakespeare_generator.nim).
+
+Trained for 45 minutes on my laptop CPU on Shakespeare, producing 4000 characters:
+
+```
+Whter!
+Take's servant seal'd, making uponweed but rascally guess-boot,
+Bare them be that been all ingal to me;
+Your play to the see's wife the wrong-pars
+With child of queer wretchless dreadful cold
+Cursters will how your part? I prince!
+This is time not in a without a tands:
+You are but foul to this.
+I talk and fellows break my revenges, so, and of the hisod
+As you lords them or trues salt of the poort.
+
+ROMEO:
+Thou hast facted to keep thee, and am speak
+Of them; she's murder'd of your galla?
+
+# [...] See example 6 for full text generation samples
+```
+

 ## Table of Contents

@@ -165,6 +196,7 @@ for t in 0 ..< 500:
 - [Reshaping and concatenation](#reshaping-and-concatenation)
 - [Broadcasting](#broadcasting)
 - [A simple two layers neural network](#a-simple-two-layers-neural-network)
+- [Teaser: A text generated with Arraymancer's recurrent neural network](#teaser-a-text-generated-with-arraymancers-recurrent-neural-network)
 - [Table of Contents](#table-of-contents)
 - [Installation](#installation)
 - [Full documentation](#full-documentation)
@@ -174,16 +206,13 @@ for t in 0 ..< 500:
 - [Handwritten digit recognition with convolutions](#handwritten-digit-recognition-with-convolutions)
 - [Sequence classification with stacked Recurrent Neural Networks](#sequence-classification-with-stacked-recurrent-neural-networks)
 - [Tensors on CPU, on Cuda and OpenCL](#tensors-on-cpu-on-cuda-and-opencl)
-  - [Speed](#speed)
-    - [Micro benchmark: Int64 matrix multiplication (October 2017)](#micro-benchmark-int64-matrix-multiplication-october-2017)
-    - [Logistic regression (October 2017)](#logistic-regression-october-2017)
-    - [DNN - 3 hidden layers (October 2017)](#dnn---3-hidden-layers-october-2017)
+- [What's new in Arraymancer v0.5.0 - "Sign of the Unicorn" - December 2018](#whats-new-in-arraymancer-v050---sign-of-the-unicorn---december-2018)
 - [4 reasons why Arraymancer](#4-reasons-why-arraymancer)
   - [The Python community is struggling to bring Numpy up-to-speed](#the-python-community-is-struggling-to-bring-numpy-up-to-speed)
   - [A researcher workflow is a fight against inefficiencies](#a-researcher-workflow-is-a-fight-against-inefficiencies)
   - [Can be distributed almost dependency free](#can-be-distributed-almost-dependency-free)
   - [Bridging the gap between deep learning research and production](#bridging-the-gap-between-deep-learning-research-and-production)
-  - [So why Arraymancer ?](#so-why-arraymancer)
+  - [So why Arraymancer ?](#so-why-arraymancer-)
 - [Future ambitions](#future-ambitions)

@@ -216,9
+245,9 @@ For now Arraymancer is mostly at the multidimensional array stage, in particular
 - No need to worry about "vectorized" operations.
 - Broadcasting support. Unlike Numpy it is explicit, you just need to use `.+` instead of `+`.
 - Plenty of reshaping operations: concat, reshape, split, chunk, permute, transpose.
-- Supports tensors of up to 7 dimensions for example a stack of 4 3D RGB minifilms of 10 seconds would be 6 dimensions:
+- Supports tensors of up to 6 dimensions. For example, a stack of 4 3D RGB minifilms of 10 seconds would be 6 dimensions:
   `[4, 10, 3, 64, 1920, 1080]` for `[nb_movies, time, colors, depth, height, width]`
-- Can read and write .csv and Numpy (.npy) files. [HDF5 support](https://github.com/mratsim/Arraymancer/pull/257) coming soon.
+- Can read and write .csv, Numpy (.npy) and HDF5 files.
 - OpenCL and Cuda backed tensors (not as feature packed as CPU tensors at the moment).
 - Covariance matrices.
 - Eigenvalues and Eigenvectors decomposition.
@@ -345,14 +374,14 @@ let exam = ctx.variable([
 # ...
 echo answer.unsqueeze(1)
 # Tensor[ex05_sequence_classification_GRU.SeqKind] of shape [8, 1] of type "SeqKind" on backend "Cpu"
-# Increasing|
-# Increasing|
-# Increasing|
-# NonMonotonic|
-# NonMonotonic|
-# Increasing| <----- Wrong!
-# Decreasing|
-# Decreasing| <----- Wrong!
+# Increasing|
+# Increasing|
+# Increasing|
+# NonMonotonic|
+# NonMonotonic|
+# Increasing| <----- Wrong!
+# Decreasing|
+# NonMonotonic|
 ```

 ### Tensors on CPU, on Cuda and OpenCL

@@ -385,85 +414,35 @@ Here is a comparative table of the core features.
 | Squeezing singleton dimension | [x] | [x] | [] |
 | Slicing + squeezing | [x] | [] | [] |
-
-### Speed
-
-Arraymancer is fast; how it achieves its speed under the hood is detailed [here](https://mratsim.github.io/Arraymancer/uth.speed.html). Slowness is a bug.
-
-#### Micro benchmark: Int64 matrix multiplication (October 2017)
-
-Integers seem to be the abandoned children of ndarray and tensor libraries. Everyone is optimising the hell out of floating points. Not so with Arraymancer:
-
-```
-Archlinux, E3-1230v5 (Skylake quad-core 3.4 GHz, turbo 3.8)
-Input 1500x1500 random large int64 matrix
-Arraymancer 0.2.90 (master branch 2017-10-10)
-```
-
-| Language | Speed | Memory |
-|---|---|---|
-| Nim 0.17.3 (devel) + OpenMP | **0.36s** | 55.5 MB |
-| Julia v0.6.0 | 3.11s | 207.6 MB |
-| Python 3.6.2 + Numpy 1.12 compiled from source | 8.03s | 58.9 MB |
-
-```
-MacOS + i5-5257U (Broadwell dual-core mobile 2.7GHz, turbo 3.1)
-Input 1500x1500 random large int64 matrix
-Arraymancer 0.2.90 (master branch 2017-10-31)
-
-no OpenMP compilation: nim c -d:native -d:release --out:build/integer_matmul --nimcache:./nimcache benchmarks/integer_matmul.nim
-with OpenMP: nim c -d:openmp --cc:gcc --gcc.exe:"/usr/local/bin/gcc-6" --gcc.linkerexe:"/usr/local/bin/gcc-6" -d:native -d:release --out:build/integer_matmul --nimcache:./nimcache benchmarks/integer_matmul.nim
-```
-
-| Language | Speed | Memory |
-|---|---|---|
-| Nim 0.18.0 (devel) - GCC 6 + OpenMP | **0.95s** | 71.9 MB |
-| Nim 0.18.0 (devel) - Apple Clang 9 - no OpenMP | **1.73s** | 71.7 MB |
-| Julia v0.6.0 | 4.49s | 185.2 MB |
-| Python 3.5.2 + Numpy 1.12 | 9.49s | 55.8 MB |
-
-Benchmark setup is in the `./benchmarks` folder and similar to (stolen from) [Kostya's](https://github.com/kostya/benchmarks#matmul). Note: Arraymancer float matmul is as fast as `Julia Native Thread`.
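For context, a minimal driver in the spirit of the integer matmul benchmark above might look like the following sketch. It assumes the `randomTensor(shape, max)` constructor and the `*` matmul operator documented for CPU tensors; it is not the actual `benchmarks/integer_matmul.nim` script.

```Nim
import arraymancer, times

let
  a = randomTensor([1500, 1500], 100)  # 1500x1500 Tensor[int] with random values up to 100
  b = randomTensor([1500, 1500], 100)

let start = epochTime()
let c = a * b                          # integer matrix multiplication
echo "matmul of shape ", c.shape, " took ", epochTime() - start, " s"
```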
-
-#### Logistic regression (October 2017)
-On the [demo benchmark](https://github.com/edubart/arraymancer-demos), Arraymancer is faster than Torch in v0.2.90.
-
-CPU
-
-| Framework | Backend | Forward+Backward Pass Time |
-|---|---|---|
-| Arraymancer v0.2.90 | OpenMP + MKL | **0.458ms** |
-| Torch7 | MKL | 0.686ms |
-| Numpy | MKL | 0.723ms |
-
-GPU
-
-| Framework | Backend | Forward+Backward Pass Time |
-|---|---|---|
-| Arraymancer v0.2.90 | Cuda | WIP |
-| Torch7 | Cuda | 0.286ms |
-
-#### DNN - 3 hidden layers (October 2017)
-
-CPU
-
-| Framework | Backend | Forward+Backward Pass Time |
-|---|---|---|
-| Arraymancer v0.2.90 | OpenMP + MKL | **2.907ms** |
-| PyTorch | MKL | 6.797ms |
-
-GPU
-
-| Framework | Backend | Forward+Backward Pass Time |
-|---|---|---|
-| Arraymancer v0.2.90 | Cuda | WIP |
-| PyTorch | Cuda | 4.765ms |
-
-```
-Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz, gcc 7.2.0, MKL 2017.17.0.4.4, OpenBLAS 0.2.20, Cuda 8.0.61, Geforce GTX 1080 Ti, Nim 0.18.0
-```
-
-In the future, Arraymancer will leverage the Nim compiler to automatically fuse operations
-like `alpha A*B + beta C` or a combination of element-wise operations. This is already done to fuse `toTensor` and `reshape`.
+## What's new in Arraymancer v0.5.0 - "Sign of the Unicorn" - December 2018
+
+> This release is named after "Sign of the Unicorn" (1975), the third book of Roger Zelazny's masterpiece "The Chronicles of Amber".
+
+The full changelog is available in [changelog.md](./changelog.md).
+
+Here are the highlights:
+ - Backward incompatible: PCA now returns a tuple of the projected tensor and the principal components. An overloaded PCA can be used with the principal axes supplied by the user.
+ - Datasets:
+   - MNIST is now autodownloaded and cached
+   - Added the IMDB Movie Reviews dataset
+ - IO:
+   - Numpy file format support
+   - Image reading and writing support (jpg, bmp, png, tga)
+   - HDF5 reading and writing
+ - Machine learning:
+   - Kmeans clustering
+ - Deep learning:
+   - RNN: GRU support, including fused stacked GRU layers with sequences/timesteps
+   - Embedding layer with multiple-timestep support. Indexing can be done with integers, bytes, chars or enums.
+   - Sparse softmax cross-entropy: the target tensor subtype can now be integers, bytes, chars or enums.
+   - Adam optimiser (Adaptive Moment Estimation)
+   - Xavier Glorot, Kaiming He and Yann Lecun weight initialisation schemes
+ - N-dimensional arrays / tensors:
+   - Splitting and chunking support
+   - Fancy indexing via `index_select`
+ - End-to-end examples:
+   - Sequence/time-series classification using RNNs
+   - Text generation on Shakespeare and Jane Austen's Pride and Prejudice. This can be applied to any text-based dataset (including blog posts, LaTeX papers and code).

diff --git a/arraymancer.nimble b/arraymancer.nimble
index f527fb6b3..037e159ee 100644
--- a/arraymancer.nimble
+++ b/arraymancer.nimble
@@ -1,5 +1,5 @@
 ### Package
-version = "0.4.0"
+version = "0.5.0"
 author = "Mamy André-Ratsimbazafy"
 description = "An n-dimensional tensor (ndarray) library"
 license = "Apache License 2.0"
@@ -222,10 +222,6 @@ task gen_doc, "Generate Arraymancer documentation":
     let modName = filePath[18..^5]
     exec r"nim doc -o:docs/build/nn_optimizers." & modName & ".html " & filePath

-  for filePath in listFiles("src/nn/shapeshifting/"):
-    let modName = filePath[21..^5]
-    exec r"nim doc -o:docs/build/nn_optimizers." & modName & ".html " & filePath
-
   for filePath in listFiles("src/nn_dsl/"):
     let modName = filePath[11..^5]
     exec r"nim doc -o:docs/build/nn_dsl."
& modName & ".html " & filePath

@@ -238,6 +234,10 @@ task gen_doc, "Generate Arraymancer documentation":
     let modName = filePath[10..^5]
     exec r"nim doc -o:docs/build/stats." & modName & ".html " & filePath

+  for filePath in listFiles("src/ml/clustering/"):
+    let modName = filePath[18..^5]
+    exec r"nim doc -o:docs/build/ml." & modName & ".html " & filePath
+
   for filePath in listFiles("src/ml/dimensionality_reduction/"):
     let modName = filePath[32..^5]
     exec r"nim doc -o:docs/build/ml." & modName & ".html " & filePath

diff --git a/changelog.md b/changelog.md
index 4e0c223fd..96864cdbd 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,6 +1,135 @@
+# Arraymancer v0.5.0 Dec. 23 2018 "Sign of the Unicorn"
+
+> This release is named after "Sign of the Unicorn" (1975), the third book of Roger Zelazny's masterpiece "The Chronicles of Amber".
+
+Changes affecting backward compatibility:
+  - PCA has been split in two:
+    - The old PCA, with input `pca(x: Tensor, nb_components: int)`, now returns a tuple
+      of the result and the principal components tensors in descending order, instead of just the result
+    - A new PCA, `pca(x: Tensor, principal_axes: Tensor)`, projects the input x
+      on the principal axes supplied
+
+Changes:
+  - Datasets:
+    - MNIST is now autodownloaded and cached
+    - Added the IMDB Movie Reviews dataset
+  - IO:
+    - Numpy file format support
+    - Image reading and writing support (jpg, bmp, png, tga)
+    - HDF5 reading and writing
+  - Machine learning:
+    - Kmeans clustering
+  - Neural network and autograd:
+    - Support subtraction, sum and stacking in neural networks
+    - Recurrent NN: GRUCell, GRU and fused stacked GRU support
+    - The NN declarative lang now supports GRU
+    - Added an Embedding layer with up to 3D input tensors [batch_size, sequence_length, features] or [sequence_length, batch_size, features]. Indexing can be done with integers of any size, bytes, chars or enums.
+    - Sparse softmax cross-entropy now supports target tensors with indices of any integer size, bytes, chars or enums.
+    - Added the Adam optimiser (Adaptive Moment Estimation)
+    - Added Hadamard product backpropagation (elementwise matrix multiply)
+    - Added the Xavier Glorot, Kaiming He and Yann Lecun weight initialisations
+    - The NN declarative lang automatically initialises weights with the following schemes:
+      - Linear and Convolution: Kaiming (suitable for Relu activation)
+      - GRU: Xavier (suitable for the internal tanh and sigmoid)
+      - Embedding: not supported in the declarative lang at the moment
+  - Tensors:
+    - Added tensor splitting and chunking
+    - Fancy indexing via `index_select`
+    - Division broadcasting, scalar division and multiplication broadcasting
+    - High-dimensional `toSeq` exports
+  - End-to-end examples:
+    - Sequence/mini time-series classification example using RNNs
+    - Training and text generation example on Shakespeare's and Jane Austen's works. This can be applied to any text-based dataset (including blog posts, LaTeX papers and code). The dataset should contain at least 700k characters (0.7 MB), which is already considered small.
+
+Important fixes:
+  - Convolution shape inference on non-unit strided convolutions
+  - Support for the upcoming OpenMP changes from nim#devel
+  - GRU: inference was squeezing all singleton dimensions instead of just the "layer" dimension.
+  - Autograd: removed pointers to avoid pointing to the wrong memory when the garbage collector moves data under pressure. This unfortunately comes at the cost of more GC pressure; it will be addressed in the future.
+  - Autograd: removed all methods.
They caused issues with generic instantiation and object variants.
+
+Special thanks to [@metasyn](https://github.com/metasyn) (MNIST caching, IMDB dataset, Kmeans) and [@Vindaar](https://github.com/vindaar) (HDF5 support and the example of using Arraymancer + Plot.ly) for their large contributions to this release.
+
+Ecosystem:
+  - Using Arraymancer + Plotly for NN training visualisation:
+    https://github.com/Vindaar/NeuralNetworkLiveDemo
+    ![](https://github.com/Vindaar/NeuralNetworkLiveDemo/raw/master/media/demo.gif)
+  - [Monocle](https://github.com/numforge/monocle), a proof-of-concept data visualisation library in Nim using [Vega](http://vega.github.io/). Hopefully allowing this kind of visualisation in the future:
+
+    ![](https://vega.github.io/images/vega-lite.png)
+    ![](https://vega.github.io/images/vg.png)
+
+    and compatibility with the Vega ecosystem, especially the Tableau-like [Voyager](https://github.com/vega/voyager).
+  - [Agent Smith](https://github.com/numforge/agent-smith), a reinforcement learning framework.
+    Currently it wraps the `Arcade Learning Environment` for practicing reinforcement learning on Atari games.
+    In the future it will wrap the Starcraft 2 AI bindings
+    and provide a high-level interface and examples for reinforcement learning algorithms.
+  - [Laser](https://github.com/numforge/laser), the future Arraymancer backend, which provides:
+    - SIMD intrinsics
+    - OpenMP templates with fine-grained control
+    - Runtime CPU feature detection for ARM and x86
+    - A proof-of-concept JIT assembler
+    - A raw minimal tensor type which can work as a view over arbitrary buffers
+    - Loop-fusion macros for iteration over an arbitrary number of tensors.
+      As far as I know it should provide the fastest multi-threaded
+      iteration scheme on strided tensors, all languages and libraries included.
+    - Optimized reductions, exponential and logarithm functions reaching
+      4x to 10x the speed of naively compiled for loops
+    - Optimised parallel strided matrix multiplication reaching 98% of OpenBLAS performance
+      - This is a generic implementation that can also be used for integers
+      - It will support preprocessing (relu_backward, tanh_backward, sigmoid_backward)
+        and epilogue (relu, tanh, sigmoid, bias addition) operation fusion
+        to avoid looping an extra time with a memory-bandwidth-bound pass.
+      - Convolutions will be optimised with a preprocessing pass fused into matrix multiplication. Traditional `im2col` solutions can only reach 16% of matrix multiplication efficiency on the common deep learning filter sizes.
+    - State-of-the-art random distributions and random sampling implementations
+      for stochastic algorithms, text generation and reinforcement learning.
+
+Future breaking changes:
+
+1. The Arraymancer backend will switch to `Laser` for the next version.
+   Impact:
+   - At a low level, CPU tensors of plain old data types will become a view on top of a
+     pointer + length instead of using the default Nim seqs. This will enable plenty of no-copy use cases,
+     and even using memory-mapped tensors for out-of-core processing.
+     However, libraries relying on the very low-level representation of tensors will break.
+     The future [type is already implemented in Laser](https://github.com/numforge/laser/blob/553497e1193725522ab7a5540ed824509424992f/laser/tensor/datatypes.nim#L12-L30).
+   - Tensors of GC-allocated types like seq, string and references will keep using Nim seqs.
+   - While it was possible to use the Javascript backend by modifying the iteration scheme,
+     this will no longer be possible at all.
Use JS->C FFI or WebAssembly compilation instead.
+   - The inline iteration **templates** `map_inline`, `map2_inline`, `map3_inline`, `apply_inline`, `apply2_inline`, `apply3_inline`, `reduce_inline`, `fold_inline`, `fold_axis_inline` will be removed and replaced by `forEach` and `forEachStaged` with the following syntax:
+     ```Nim
+     forEach x in a, y in b, z in c:
+       x += y * z
+     ```
+     Both will work with an arbitrary number of tensors and will generate 2x to 3x more compact code while being about 30% more efficient for strided iteration. Furthermore, `forEachStaged` will allow precise control of the parallelisation strategy, including pre-loop and post-loop synchronisation with thread-local variables, locks, atomics and barriers.
+     The existing higher-order **functions** `map`, `map2`, `apply`, `apply2`, `fold`, `reduce` will not be impacted. For small inlinable functions it will be recommended to use the `forEach` macro to remove function call overhead (you can't inline a proc parameter).
+
+2. The neural network domain specific language will use less magic
+   for the `forward` proc.
+   Currently the neural net domain specific language only allows the type
+   `Variable[T]` for inputs and the result.
+   This prevents its use with embedding layers, which also require an index input.
+   Furthermore, this prevents using a `tuple[output, hidden: Variable]` result type,
+   which is very useful for passing an RNN's hidden state in generative neural networks (for example text sequences or time series).
+   So unfortunately the syntax will go from the current
+   `forward x, y:` shortcut to classic Nim: `proc forward[T](x, y: Variable[T]): Variable[T]`.
+
+3. Once CuDNN GRU is implemented, the GRU layer might need some adjustments to give the same results on CPU and Nvidia GPUs and to allow using GPU-trained weights on CPU and vice-versa.
+
+Thanks:
+  - @metasyn: datasets and Kmeans clustering
+  - @vindaar: HDF5 support and the Plot.ly demo
+  - @bluenote10: toSeq exports
+  - @andreaferretti: adding an axis parameter to the Mean layer autograd
+  - all the contributors of fixes in code and documentation
+  - the Nim community for the encouragements
+
 Arraymancer v0.4.0 May 05 2018 "The Name of the Wind"
 =====================================================

+> This release is named after "The Name of the Wind" (2007), the first book of Patrick Rothfuss' masterpiece "The Kingkiller Chronicle".
+
 Changes:

   - Core:
@@ -47,6 +176,8 @@ Thanks:

 Arraymancer v0.3.0 Dec. 14 2017 "Wizard's First Rule"
 =====================================================

+> This release is named after "Wizard's First Rule" (1994), the first book of Terry Goodkind's masterpiece "The Sword of Truth".
+
 I am very excited to announce the third release of Arraymancer which includes numerous improvements, features and (unfortunately!) breaking changes.

 Warning ⚠: ALL deprecated procs will be removed in the next release, due to deprecation spam and to reduce the maintenance burden.

@@ -113,9 +244,11 @@ Changes:

   - All `unsafe` procs are now default and deprecated.

-Arraymancer v0.2.0 Sept. 24, 2017 "The Color of Magic"
+Arraymancer v0.2.0 Sept. 24, 2017 "The Colour of Magic"
 ======================================================

+> This release is named after "The Colour of Magic" (1983), the first book of Terry Pratchett's masterpiece "Discworld".
+
 I am very excited to announce the second release of Arraymancer which includes numerous improvements `blablabla` ...
Without further ado:

@@ -154,6 +287,8 @@ Minor revisions v0.1.1 to v0.1.3

 Arraymancer v0.1.0 July 12, 2017 "Magician Apprentice"
 =======================================================

+> This release is named after "Magician: Apprentice" (1982), the first book of Raymond E. Feist's masterpiece "The Riftwar Cycle".
+
 First public release. It includes:

diff --git a/nimdoc.cfg b/nimdoc.cfg
index e4ccffc37..95f522106 100644
--- a/nimdoc.cfg
+++ b/nimdoc.cfg
@@ -84,6 +84,8 @@ doc.file = """
   • Activation: Sigmoid
   • Activation: Tanh
   • Layers: Convolution 2D
+  • Layers: Embedding
+  • Layers: GRU (Gated Recurrent Unit)
   • Layers: Linear/Dense
   • Layers: Maxpool 2D
   • Loss: Cross-Entropy losses
@@ -108,6 +110,9 @@ doc.file = """
@@ -117,7 +122,10 @@ doc.file = """
   • Data structure
   • Basic operations
   • Linear algebra operations
+  • Hadamard product (elementwise matrix multiply)
   • Reduction operations
+  • Concatenation, stacking, splitting, chunking operations
+  • Linear algebra operations
@@ -126,6 +134,8 @@ doc.file = """
   • Activations
   • Convolution 2D
   • Convolution 2D - CuDNN
+  • Embedding
+  • GRU (Gated Recurrent Unit)
   • Linear / Dense layer
   • Maxpooling
   • Numerical gradient

diff --git a/src/nn_primitives/nn_primitives.nim b/src/nn_primitives/nn_primitives.nim
index 6b6cf135e..0fc357390 100644
--- a/src/nn_primitives/nn_primitives.nim
+++ b/src/nn_primitives/nn_primitives.nim
@@ -20,7 +20,7 @@ import
   ./nnp_activation,
   ./nnp_maxpooling,
   ./nnp_softmax,
   ./nnp_numerical_gradient,
-  ./recurrent/nnp_gru,
+  ./nnp_gru,
   ./nnp_embedding.nim

 export nnp_activation,
diff --git a/src/nn_primitives/recurrent/nnp_gru.nim b/src/nn_primitives/nnp_gru.nim
similarity index 99%
rename from src/nn_primitives/recurrent/nnp_gru.nim
rename to src/nn_primitives/nnp_gru.nim
index 524b47a98..2a0ffef02 100644
--- a/src/nn_primitives/recurrent/nnp_gru.nim
+++ b/src/nn_primitives/nnp_gru.nim
@@ -3,9 +3,9 @@
 # This file may not be copied, modified, or distributed except according to those terms.

 import
-  ../../tensor/tensor,
-  ../private/p_activation, ../nnp_linear,
-  ../nnp_activation
+  ../tensor/tensor,
+  private/p_activation, ./nnp_linear,
+  nnp_activation

 # For compatibility with CuDNN and allow loading CPU/Cuda weights interchangeably,
 # we use the following equations,
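 # which follow the standard CuDNN GRU formulation (shown here for reference;
 # σ is the sigmoid, .* is the elementwise product, h is the previous hidden
 # state and h' is the new one):
 #
 #   r  =    σ(Wr * x + bWr + Ur * h + bUr)
 #   z  =    σ(Wz * x + bWz + Uz * h + bUz)
 #   n  = tanh(W  * x + bW  + r .* (U * h + bU))
 #   h' = (1 - z) .* n + z .* h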