diff --git a/benchmarks/ex01_xor.nim b/benchmarks/ex01_xor.nim index e815d97b8..ef1e48aac 100644 --- a/benchmarks/ex01_xor.nim +++ b/benchmarks/ex01_xor.nim @@ -1,50 +1,52 @@ import ../src/arraymancer # Learning XOR function with a neural network. - -# Autograd context / neuralnet graph -let ctx = newContext Tensor[float32] -let bsz = 32 # batch size - -let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool) -let y_bool = x_train_bool[_,0] xor x_train_bool[_,1] -let x_train = ctx.variable(x_train_bool.astype(float32)) -let y = y_bool.astype(float32) - -# We will build the following network: -# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss - -let layer_3neurons = ctx.variable( - randomTensor(3, 2, 2.0f) -. 1.0f, - requires_grad = true - ) - -let classifier_layer = ctx.variable( - randomTensor(1, 3, 2.0f) -. 1.0f, - requires_grad = true - ) - -# Stochastic Gradient Descent -let optim = newSGD[float32]( - layer_3neurons, classifier_layer, 0.01f - ) - -# Learning loop -for epoch in 0..10000: - for batch_id in 0..<100: - - # minibatch offset in the Tensor - let offset = batch_id * 32 - let x = x_train[offset ..< offset + 32, _] - let target = y[offset ..< offset + 32, _] - - # Building the network - let n1 = relu linear(x, layer_3neurons) - let n2 = linear(n1, classifier_layer) - let loss = n2.sigmoid_cross_entropy(target) - - # Compute the gradient (i.e. contribution of each parameter to the loss) - loss.backprop() - - # Correct the weights now that we have the gradient information - optim.update() +proc main() = + # Autograd context / neuralnet graph + let ctx = newContext Tensor[float32] + let bsz = 32 # batch size + + let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool) + let y_bool = x_train_bool[_,0] xor x_train_bool[_,1] + let x_train = ctx.variable(x_train_bool.astype(float32)) + let y = y_bool.astype(float32) + + # We will build the following network: + # Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss + + let layer_3neurons = ctx.variable( + randomTensor(3, 2, 2.0f) -. 1.0f, + requires_grad = true + ) + + let classifier_layer = ctx.variable( + randomTensor(1, 3, 2.0f) -. 1.0f, + requires_grad = true + ) + + # Stochastic Gradient Descent + let optim = newSGD[float32]( + layer_3neurons, classifier_layer, 0.01f + ) + + # Learning loop + for epoch in 0..10000: + for batch_id in 0..<100: + + # minibatch offset in the Tensor + let offset = batch_id * 32 + let x = x_train[offset ..< offset + 32, _] + let target = y[offset ..< offset + 32, _] + + # Building the network + let n1 = relu linear(x, layer_3neurons) + let n2 = linear(n1, classifier_layer) + let loss = n2.sigmoid_cross_entropy(target) + + # Compute the gradient (i.e. 
contribution of each parameter to the loss) + loss.backprop() + + # Correct the weights now that we have the gradient information + optim.update() + +main() diff --git a/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim b/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim index 74047d2b3..4457aeef7 100644 --- a/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim +++ b/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim @@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T]( result = zeros_like(cached_tensor) + # TODO: nested parallelism has suspect performance for i in 0||(batch_size-1): let (max, sumexp) = cached_tensor[i,_].streaming_max_sumexp diff --git a/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim b/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim index 54cbd6bc5..acd16871a 100644 --- a/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim +++ b/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim @@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T]( result = zeros_like(cached_tensor) + # TODO: nested parallelism has suspect performance for i in 0||(batch_size-1): # Can't use OpenMP - SIGSEGV Illegal Address let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp diff --git a/examples/ex06_shakespeare_generator.nim b/examples/ex06_shakespeare_generator.nim index bf179d590..1cdf1e56b 100644 --- a/examples/ex06_shakespeare_generator.nim +++ b/examples/ex06_shakespeare_generator.nim @@ -56,7 +56,7 @@ const # # ################################################################ -func strToTensor(str: string|TaintedString): Tensor[PrintableIdx] = +proc strToTensor(str: string|TaintedString): Tensor[PrintableIdx] = result = newTensor[PrintableIdx](str.len) # For each x in result, map the corresponding char index diff --git a/src/arraymancer/io/io_image.nim b/src/arraymancer/io/io_image.nim index e998cd83d..3e211f9e0 100644 --- a/src/arraymancer/io/io_image.nim +++ b/src/arraymancer/io/io_image.nim @@ -27,7 +27,7 @@ proc read_image*(filepath: string): Tensor[uint8] = ## Usage example with conversion to [0..1] float: ## .. code:: nim ## let raw_img = read_image('path/to/image.png') - ## let img = raw_img.map_inline: + ## let img = forEach x in raw_img: ## x.float32 / 255.0 var width, height, channels: int diff --git a/src/arraymancer/laser/strided_iteration/foreach_common.nim b/src/arraymancer/laser/strided_iteration/foreach_common.nim index ff46b67dc..06c424040 100644 --- a/src/arraymancer/laser/strided_iteration/foreach_common.nim +++ b/src/arraymancer/laser/strided_iteration/foreach_common.nim @@ -4,14 +4,73 @@ # This file may not be copied, modified, or distributed except according to those terms. import - macros, + std/[macros, strutils], ../compiler_optim_hints -template isVar[T: object](x: T): bool = +template isVar[T](x: T): bool = ## Workaround due to `is` operator not working for `var` ## https://github.com/nim-lang/Nim/issues/9443 compiles(addr(x)) +proc aliasTensor(id: int, tensor: NimNode): tuple[alias: NimNode, isVar: NimNode] = + ## Produce an alias variable for a tensor + ## Supports: + ## - identifiers + ## - dot call for field access + ## - foo[1] for array access + if tensor.kind notin {nnkIdent, nnkSym, nnkDotExpr, nnkBracketExpr, nnkCall}: + error "Expected a variable identifier or field access or array indexing but found \"" & tensor.repr() & + "\". Please assign to a variable first." 
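A note on what `aliasTensor` enables: `forEach` operands are no longer limited to plain identifiers, so the optimizer updates later in this diff can pass fields and indexed elements such as `self.first_moments[i]` directly. A minimal sketch of that usage, with a hypothetical `Moments` type that is not part of the diff:

import arraymancer

type Moments = object
  first_moments: seq[Tensor[float32]]

var m = Moments(first_moments: @[zeros[float32](3), zeros[float32](3)])
let grad = ones[float32](3)
let i = 0

# `m.first_moments[i]` is a dot expression inside a bracket expression;
# `aliasTensor` flattens it into a unique local alias (roughly
# `m_first_moments_i_alias0_`) so the generated loop binds it once
# and can mutate its elements in place.
forEach fm in m.first_moments[i], g in grad:
  fm = 0.9'f32 * fm + 0.1'f32 * g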
+ + var t = tensor + + # nnkCall handles the annoying typed AST we get in generics + # Call + # OpenSymChoice + # Sym "[]" + # Sym "[]" + # ... + # DotExpr + # Ident "self" + # Ident "first_moments" + # Ident "i" + if tensor.kind == nnkCall: + tensor[0].expectKind nnkOpenSymChoice + tensor[0][0].expectKind nnkSym + doAssert tensor[0][0].eqIdent("[]") + + # Rewrite the AST to untyped + t = nnkBracketExpr.newTree( + tensor[1] + ) + for i in 2 ..< tensor.len: + t.add tensor[i] + + let isVar = block: + # Handle slicing cases like foo[0..<1, 0..<2] + # that do not return `var` but are technically `var` + # if `foo` is var + if t.kind in {nnkDotExpr, nnkBracketExpr}: + let t0 = t[0] + quote do: isVar(`t0`) + else: + quote do: isVar(`t`) + + proc flattenedName(node: NimNode): string = + if node.kind in {nnkDotExpr, nnkBracketExpr}: + result = flattenedName(node[0]) + result &= '_' + result &= flattenedName(node[1]) + elif node.kind in {nnkIdent, nnkSym}: + result = $node + else: + error "Expected a field nameor dot expression or array access but found \"" & + t.repr() + + let alias = flattenedName(t) + + return (newIdentNode($alias & "_alias" & $id & '_'), isVar) + proc initForEach*( params: NimNode, values, aliases, raw_ptrs: var NimNode, @@ -24,7 +83,9 @@ proc initForEach*( var tensors = nnkBracket.newTree() template syntaxError() {.dirty.} = - error "Syntax error: argument " & ($arg.kind).substr(3) & " in position #" & $i & " was unexpected." + error "Syntax error: argument " & + ($arg.kind).substr(3) & " in position #" & $i & " was unexpected." & + "\n " & arg.repr() for i, arg in params: if arg.kind == nnkInfix: @@ -56,15 +117,15 @@ proc initForEach*( aliases_stmt.add newCall(bindSym"withCompilerOptimHints") for i, tensor in tensors: - let alias = newIdentNode($tensor & "_alias" & $i & '_') + let (alias, detectVar) = aliasTensor(i, tensor) aliases.add alias aliases_stmt.add quote do: - when isVar(`tensor`): + when `detectVar`: var `alias`{.align_variable.} = `tensor` else: let `alias`{.align_variable.} = `tensor` - let raw_ptr_i = genSym(nskLet, $tensor & "_raw_data" & $i & '_') + let raw_ptr_i = genSym(nskLet, $alias & "_raw_data" & $i & '_') raw_ptrs_stmt.add quote do: let `raw_ptr_i`{.restrict.} = `alias`.unsafe_raw_offset() raw_ptrs.add raw_ptr_i @@ -74,7 +135,9 @@ proc initForEach*( for i in 1 ..< aliases.len: let alias_i = aliases[i] test_shapes.add quote do: - assert `alias0`.shape == `alias_i`.shape + assert `alias0`.shape == `alias_i`.shape, + "\n " & astToStr(`alias0`) & ".shape is " & $`alias0`.shape & "\n" & + "\n " & astToStr(`alias_i`) & ".shape is " & $`alias_i`.shape & "\n" template stridedVarsSetup*(): untyped {.dirty.} = for i, alias in aliases: diff --git a/src/arraymancer/laser/tensor/initialization.nim b/src/arraymancer/laser/tensor/initialization.nim index 727e70dbc..5d65144ad 100644 --- a/src/arraymancer/laser/tensor/initialization.nim +++ b/src/arraymancer/laser/tensor/initialization.nim @@ -136,14 +136,20 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr T, len: Natural) = withCompilerOptimHints() doAssert dst.size == len, "Tensor size and buffer length should be the same" let buf{.restrict.} = cast[ptr UncheckedArray[T]](buffer) - omp_parallel_chunks( - len, chunk_offset, chunk_size, - OMP_MEMORY_BOUND_GRAIN_SIZE * 4): - copyMem( - dst.unsafe_raw_offset[chunk_offset].addr, - buf[chunk_offset].unsafeAddr, - chunk_size * sizeof(T) - ) + # No OpenMP - unexplained overhead https://github.com/mratsim/Arraymancer/issues/485 + # omp_parallel_chunks( + # len, 
chunk_offset, chunk_size, + # OMP_MEMORY_BOUND_GRAIN_SIZE * 4): + # copyMem( + # dst.unsafe_raw_offset[chunk_offset].addr, + # buf[chunk_offset].unsafeAddr, + # chunk_size * sizeof(T) + # ) + copyMem( + dst.unsafe_raw_offset[0].addr, + buf[0].unsafeAddr, + len * sizeof(T) + ) else: {.fatal: "Only non-ref types and types with trivial destructors can be raw copied.".} @@ -166,14 +172,19 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) = when not (T is KnownSupportsCopyMem): t.storage.raw_buffer.reset() t.storage.raw_buffer.setLen(t.size) - else: - omp_parallel_chunks( - t.size, chunk_offset, chunk_size, - OMP_MEMORY_BOUND_GRAIN_SIZE * 4): - zeroMem( - t.unsafe_raw_offset[chunk_offset].addr, - chunk_size * sizeof(T) - ) + else: # if setZero or newTensor are used in OpenMP parallel regions + # making this parallel will kill performance. + # omp_parallel_chunks( + # t.size, chunk_offset, chunk_size, + # OMP_MEMORY_BOUND_GRAIN_SIZE * 4): + # zeroMem( + # t.unsafe_raw_offset[chunk_offset].addr, + # chunk_size * sizeof(T) + # ) + zeroMem( + t.unsafe_raw_offset[0].addr, + t.size * sizeof(T) + ) proc newTensor*[T](shape: varargs[int]): Tensor[T] = var size: int diff --git a/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim b/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim index 4be57d7d8..0f8f2d232 100644 --- a/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim +++ b/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim @@ -53,11 +53,13 @@ proc gelsd*[T: SomeFloat]( var b2: Tensor[T] b2.newMatrixUninitColMajor(ldb.int, nrhs.int) - var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs] # Workaround because slicing does no produce a var at the moment - apply2_inline(b2_slice, b): - # paste b in b2. 
- # if b2 is larger, the rest is zeros - y + var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs] + if is1d: + b2_slice = b2_slice.squeeze(1) + + forEach ib2 in b2_slice, + ib in b: + ib2 = ib # Temporary sizes const smlsiz = 25'i32 # equal to the maximum size of the subproblems at the bottom of the computation tree diff --git a/src/arraymancer/ml/dimensionality_reduction/pca.nim b/src/arraymancer/ml/dimensionality_reduction/pca.nim index 129b47481..bea769cdd 100644 --- a/src/arraymancer/ml/dimensionality_reduction/pca.nim +++ b/src/arraymancer/ml/dimensionality_reduction/pca.nim @@ -187,8 +187,10 @@ proc pca_detailed*[T: SomeFloat]( # Variance explained by Singular Values let bessel_correction = T(result.n_observations - 1) - result.explained_variance = map_inline(S): - x * x / bessel_correction + result.explained_variance = newTensorUninit[T](S.shape) + forEach ev in result.explained_variance, + s in S: + ev = s*s / bessel_correction # Since we are using SVD truncated to `n_components` we need to # refer back to the original matrix for total variance diff --git a/src/arraymancer/ml/metrics/common_error_functions.nim b/src/arraymancer/ml/metrics/common_error_functions.nim index 504421b97..c1534c606 100644 --- a/src/arraymancer/ml/metrics/common_error_functions.nim +++ b/src/arraymancer/ml/metrics/common_error_functions.nim @@ -22,7 +22,11 @@ proc squared_error*[T](y, y_true: T): T {.inline.} = proc squared_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} = ## Element-wise squared error for a tensor, |y_true - y| ^2 - result = map2_inline(y_true, y, squared_error(x,y)) + result = newTensorUninit[T](y.shape) + forEach r in result, + testval in y, + truthval in y_true: + r = squared_error(testval,truthval) proc mean_squared_error*[T](y, y_true: Tensor[T]): T = ## Also known as MSE or L2 loss, mean squared error between elements: @@ -48,7 +52,11 @@ proc relative_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} = ## Normally the relative error is defined as |y_true - x| / |y_true|, ## but here max is used to make it symmetric and to prevent dividing by zero, ## guaranteed to return zero in the case when both values are zero. 
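For readers unfamiliar with the migration pattern used throughout this diff: the closure-style `map2_inline`, which injects `x`/`y` and allocates its own result, is replaced by an explicit destination plus a fused `forEach` zip. A small side-by-side sketch mirroring the `squared_error` change above (values are illustrative):

import arraymancer

let y_pred = [1.0, 2.0, 3.0].toTensor
let y_true = [1.0, 2.5, 2.0].toTensor

# Old style: the template injects `x`/`y` and allocates the result itself
let err_old = map2_inline(y_true, y_pred):
  (x - y) * (x - y)

# New style: named operands, a pre-allocated destination,
# and a single fused strided loop over all three tensors
var err_new = newTensorUninit[float](y_pred.shape)
forEach e in err_new, t in y_true, p in y_pred:
  e = (t - p) * (t - p)

doAssert err_old == err_new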
- result = map2_inline(y, y_true, relative_error(x,y)) + result = newTensorUninit[T](y.shape) + forEach r in result, + testval in y, + truthval in y_true: + r = relative_error(testval,truthval) proc mean_relative_error*[T](y, y_true: Tensor[T]): T = ## Mean relative error for Tensor, mean of the element-wise @@ -67,10 +75,14 @@ proc absolute_error*[T: SomeFloat](y, y_true: T): T {.inline.} = proc absolute_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} = ## Element-wise absolute error for a tensor - result = map2_inline(y, y_true, y.absolute_error(x)) + result = newTensorUninit[T](y.shape) + forEach r in result, + testval in y, + truthval in y_true: + r = absolute_error(testval,truthval) proc mean_absolute_error*[T](y, y_true: Tensor[T]): T = ## Also known as L1 loss, absolute error between elements: ## sum(|y_true - y|)/m ## where m is the number of elements - result = map2_inline(y, y_true, y.absolute_error(x)).mean() + result = y.absolute_error(y_true).mean() diff --git a/src/arraymancer/nn/loss/mean_square_error_loss.nim b/src/arraymancer/nn/loss/mean_square_error_loss.nim index f59d8730a..1de096e85 100644 --- a/src/arraymancer/nn/loss/mean_square_error_loss.nim +++ b/src/arraymancer/nn/loss/mean_square_error_loss.nim @@ -30,8 +30,10 @@ proc mse_backward_ag[TT](self: MSELoss[TT], payload: Payload[TT]): SmallDiffs[TT # See also Stanford course: http://theory.stanford.edu/~tim/s15/l/l15.pdf result = newDiffs[TT](1) - result[0] = map2_inline(self.cache.value, self.target): - norm * (x - y) + forEach r0 in result[0], + v in self.cache.value, + t in self.target: + r0 = norm * (v - t) proc mse_cache[TT](result: Variable[TT], input: Variable[TT], target: TT) = ## We expect input with shape [batch_size, features] diff --git a/src/arraymancer/nn/optimizers/optimizers.nim b/src/arraymancer/nn/optimizers/optimizers.nim index f5dd2c7bd..a7caf27c0 100644 --- a/src/arraymancer/nn/optimizers/optimizers.nim +++ b/src/arraymancer/nn/optimizers/optimizers.nim @@ -47,8 +47,9 @@ proc update*(self: Sgd) = for v in self.params: # v.value -= learning rate * grad if v.requires_grad: - apply2_inline(v.value, v.grad): - x - self.lr * y + forEach val in v.value, + g in v.grad: + val -= self.lr * g # Zero the gradient v.grad = v.value.zeros_like # TODO "setZero" instead of a new allocation @@ -128,19 +129,23 @@ proc update*(self: var SgdMomentum) = # This implementation of both kinds of momentum follows that of Tensorflow # and closely mirrors the formulation of Sustkever et. al. # Update the moments with the previous update. - apply2_inline(self.moments[i], v.grad): - self.momentum * x - self.lr * y + forEach mi in self.moments[i], + g in v.grad: + mi = self.momentum * mi - self.lr * g # When momentum = 0 this acts identically to SGD without momentum. 
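To make the moment arithmetic above easier to follow outside the optimizer object, here is the same update written against free-standing tensors (names and values are illustrative, not the optimizer's fields):

import arraymancer

var value  = [1.0'f32, 2.0'f32, 3.0'f32].toTensor
var moment = zeros[float32](3)
let grad   = [0.1'f32, 0.2'f32, 0.3'f32].toTensor
let lr       = 0.01'f32
let momentum = 0.9'f32

# moment <- momentum * moment - lr * grad   (cached between steps)
forEach mi in moment, g in grad:
  mi = momentum * mi - lr * g

# classic (non-Nesterov) momentum step: value <- value + moment
forEach v in value, mi in moment:
  v += mi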
if self.nesterov: # Update the params with formula Value = value - lr * gradient + momentum * v # where v = - lr * gradient + momentum * moment - apply3_inline(v.value, v.grad, self.moments[i]): - x - self.lr * y + self.momentum * z + forEach val in v.value, + g in v.grad, + mi in self.moments[i]: + val += -(self.lr * g) + self.momentum * mi else: # Update the params with formula Value = value - lr * gradient + momentum * moment - apply2_inline(v.value, self.moments[i]): - x + y + forEach val in v.value, + mi in self.moments[i]: + val += mi # Zero the gradient v.grad = v.value.zeros_like # TODO "setZero" instead of a new allocation @@ -212,19 +217,21 @@ proc update*(self: var Adam) = let v = self.params[i] if v.requires_grad: # Update biaised first moment estimate - apply2_inline(self.first_moments[i], v.grad): - self.beta1 * x + (1 - self.beta1) * y + forEach fmi in self.first_moments[i], g in v.grad: + fmi = self.beta1 * fmi + (1 - self.beta1) * g # Update biaised second moment estimate - apply2_inline(self.second_moments[i], v.grad): - self.beta2 * x + (1 - self.beta2) * y * y + forEach smi in self.second_moments[i], g in v.grad: + smi = self.beta2 * smi + (1 - self.beta2) * g * g # Adjust weight - apply3_inline(v.value, self.first_moments[i], self.second_moments[i]): - x - lr_t * y / (z.sqrt + self.epsilon) + forEach val in v.value, + fmi in self.first_moments[i], + smi in self.second_moments[i]: + val -= lr_t * fmi / (smi.sqrt + self.epsilon) # Zero the gradient v.grad = v.value.zeros_like # TODO "setZero" instead of a new allocation -func optimizerAdam*[M, T]( +proc optimizerAdam*[M, T]( model: M, learning_rate: T = T(0.001), beta1 = T(0.9), beta2 = T(0.999), diff --git a/src/arraymancer/nn_primitives/nnp_activation.nim b/src/arraymancer/nn_primitives/nnp_activation.nim index a9be939b5..7ee75b37e 100644 --- a/src/arraymancer/nn_primitives/nnp_activation.nim +++ b/src/arraymancer/nn_primitives/nnp_activation.nim @@ -29,14 +29,19 @@ proc sigmoid*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.}= # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T) - result = map_inline(t): - sigmoid(x) + result = newTensorUninit[T](t.shape) + forEach dst in result, src in t: + dst = sigmoid(src) proc relu*[T](t: Tensor[T]): Tensor[T] {.noInit.}= - t.map_inline max(0.T,x) + result = newTensorUninit[T](t.shape) + forEach dst in result, src in t: + dst = max(0.T, src) proc tanh*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.}= - t.map_inline tanh(x) + result = newTensorUninit[T](t.shape) + forEach dst in result, src in t: + dst = tanh(src) # ################################################################################################## # In-place forward @@ -46,32 +51,37 @@ proc msigmoid*[T: SomeFloat](t: var Tensor[T]) = ## Note: Canonical sigmoid is not stable for large negative value # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T) - apply_inline(t): - sigmoid(x) + forEach x in t: + x = sigmoid(x) proc mrelu*[T](t: var Tensor[T]) = - t.apply_inline max(0.T, x) + forEach x in t: + x = max(0.T, x) proc mtanh*[T: SomeFloat](t: var Tensor[T]) = - t.apply_inline tanh(x) + forEach x in t: + x = tanh(x) # ################################################################################################## # Backward proc sigmoid_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.noInit.}= - result = map2_inline(cached_tensor, gradient): - x * (1 - x) * y + result = newTensorUninit[T](gradient.shape) + forEach r in result, c in 
cached_tensor, g in gradient: + r = c*(1-c) * g proc relu_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.noInit.}= - result = map2_inline(cached_tensor, gradient): - if x <= 0.T: - 0.T + result = newTensorUninit[T](gradient.shape) + forEach r in result, c in cached_tensor, g in gradient: + if c <= 0.T: + r = 0.T else: - y + r = g proc tanh_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.noInit.}= - result = map2_inline(cached_tensor, gradient): - y * (1 - x * x) + result = newTensorUninit[T](gradient.shape) + forEach r in result, c in cached_tensor, g in gradient: + r = g * (1 - c * c) # #################################################################################################### # Documentation diff --git a/src/arraymancer/nn_primitives/nnp_gru.nim b/src/arraymancer/nn_primitives/nnp_gru.nim index 1453081eb..e154e5717 100644 --- a/src/arraymancer/nn_primitives/nnp_gru.nim +++ b/src/arraymancer/nn_primitives/nnp_gru.nim @@ -63,7 +63,6 @@ proc gru_cell_inference*[T: SomeFloat]( # Slices sr = (0 ..< H)|1 sz = (H ..< 2*H)|1 - srz = (0 ..< 2*H)|1 s = (2*H ..< 3*H)|1 @@ -73,19 +72,29 @@ proc gru_cell_inference*[T: SomeFloat]( linear(input, W3, bW3, W3x) linear(hidden, U3, bU3, U3h) - # Step 2 - Computing reset (r) and update (z) gate - var W2ru = W3x[_, srz] # shape [batch_size, 2*H] - we reuse the previous buffer - apply2_inline(W2ru, U3h[_, srz]): - sigmoid(x + y) - - # Step 3 - Computing candidate hidden state ñ - var n = W3x[_, s] # shape [batch_size, H] - we reuse the previous buffer - apply3_inline(n, W2ru[_, sr], U3h[_, s]): - tanh(x + y * z) - - # Step 4 - Update the hidden state - apply3_inline(hidden, W3x[_, sz], n): - (1 - y) * z + y * x + # Step 2 - Fused evaluation of the 4 GRU equations + # r = σ(Wr * x + bWr + Ur * h + bUr) + # z = σ(Wz * x + bWz + Uz * h + bUz) + # n = tanh(W * x + bW + r *. (U * h + bU )) + # h' = (1 - z) *. n + z *. h + + # shape [batch_size, H] - we reuse the previous buffers + forEach wrx in W3x[_, sr], # Wr*x + wzx in W3x[_, sz], # Wz*x + wx in W3x[_, s], # W*x + urh in U3h[_, sr], # Ur*h + uzh in U3h[_, sz], # Uz*h + uh in U3h[_, s], # U*h + h in hidden: # hidden state + # Reset (r) gate and Update (z) gate + let r = sigmoid(wrx + urh) + let z = sigmoid(wzx + uzh) + + # Candidate hidden state ñ + let n = tanh(wx + r * uh) + + # h' = (1 - z) *. ñ + z *. h + h = (1-z) * n + z*h proc gru_cell_forward*[T: SomeFloat]( input, @@ -124,26 +133,38 @@ proc gru_cell_forward*[T: SomeFloat]( linear(input, W3, bW3, W3x) linear(hidden, U3, bU3, U3h) - # # Saving for backprop - apply2_inline(Uh, U3h[_, s]): - y - - # Step 2 - Computing reset (r) and update (z) gate - apply3_inline(r, W3x[_, sr], U3h[_, sr]): - sigmoid(y + z) - - apply3_inline(z, W3x[_, sz], U3h[_, sz]): - sigmoid(y + z) - - # Step 3 - Computing candidate hidden state ñ - # TODO: need apply4 / loopfusion for efficient - # buffer passing in Stacked GRU implementation - n = map3_inline(W3x[_, s], r, U3h[_, s]): - tanh(x + y * z) - - # Step 4 - Update the hidden state - apply3_inline(hidden, z, n): - (1 - y) * z + y * x + # Step 2 - Fused evaluation of the 4 GRU equations + # and saving for backprop + # r = σ(Wr * x + bWr + Ur * h + bUr) + # z = σ(Wz * x + bWz + Uz * h + bUz) + # n = tanh(W * x + bW + r *. (U * h + bU )) + # h' = (1 - z) *. n + z *. 
h + + # shape [batch_size, H] - we reuse the previous buffers + forEach wrx in W3x[_, sr], # Wr*x + wzx in W3x[_, sz], # Wz*x + wx in W3x[_, s], # W*x + urh in U3h[_, sr], # Ur*h + uzh in U3h[_, sz], # Uz*h + uh in U3h[_, s], # U*h + h in hidden, # hidden state + saveUh in Uh, # U*h cache for backprop + reset in r, # reset gate cache for backprop + update in z, # update gate cache for backprop + candidate in n: # candidate hidden state cache for backprop + + # Cache for backprop + saveUh = uh + + # Reset (r) gate and Update (z) gate + reset = sigmoid(wrx + urh) + update = sigmoid(wzx + uzh) + + # Candidate hidden state ñ + candidate = tanh(wx + reset * uh) + + # h' = (1 - z) *. ñ + z *. h + h = (1-update) * candidate + update*h proc gru_cell_backward*[T: SomeFloat]( dx, dh, dW3, dU3, # input and weights gradients @@ -162,6 +183,9 @@ proc gru_cell_backward*[T: SomeFloat]( ## - dnext: gradient flowing back from the next layer ## - x, h, W3, U3: inputs saved from the forward pass ## - r, z, n, Uh: intermediate results saved from the forward pass of shape [batch_size, hidden_size] + + # TODO: fused backprop with forEach + # Backprop of step 4 - z part let dz = (h - n) *. dnext let dn = (1.0.T -. z) *. dnext @@ -188,8 +212,8 @@ proc gru_cell_backward*[T: SomeFloat]( linear_backward(h, U3, dU3h, dh, dU3, dbU3) # Backprop of step 4 - h part - apply3_inline(dh, dnext, z): - x + y * z + forEach dhi in dh, dni in dnext, zi in z: + dhi += dni * zi proc gru_inference*[T: SomeFloat]( input: Tensor[T], @@ -351,10 +375,6 @@ proc gru_forward*[T: SomeFloat]( n_lts = ns[layer, timestep, _, _].squeeze(0).squeeze(0) Uh_lts = Uhs[layer, timestep, _, _].squeeze(0).squeeze(0) - # TODO: gru_cell_forward will detach `nl`` - # due to a missing apply4/loop-fusion operation - var n_tmp = n_lts - let input_ts = block: if layer == 0: input[timestep, _, _].squeeze(0) @@ -364,14 +384,10 @@ proc gru_forward*[T: SomeFloat]( gru_cell_forward( input_ts, W3l, U3l, bW3l, bU3l, - r_lts, z_lts, n_tmp, Uh_lts, + r_lts, z_lts, n_lts, Uh_lts, hiddenl ) output[timestep, _, _] = hiddenl.unsqueeze(0) - # TODO: apply/loop-fusion - # copy n_tmpl back to nl - apply2_inline(n_lts, n_tmp): - y proc gru_backward*[T: SomeFloat]( dInput, dHidden0, # Input and starting hidden state gradient diff --git a/src/arraymancer/nn_primitives/nnp_linear.nim b/src/arraymancer/nn_primitives/nnp_linear.nim index 098a62bda..e3f211d6b 100644 --- a/src/arraymancer/nn_primitives/nnp_linear.nim +++ b/src/arraymancer/nn_primitives/nnp_linear.nim @@ -24,7 +24,6 @@ proc linear*[T](input, weight: Tensor[T], bias: Tensor[T], output: var Tensor[T] # - bias tensor shape [1, out_features] # Output does not need to be initialized to 0 or the proper shape, data will be overwritten # Output is: Y = x * W.transpose + b - output = input * weight.transpose # TODO: with the transpose the non-matching rows and cols is confusing output +.= bias diff --git a/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim b/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim index 74a3bf544..0d8247b58 100644 --- a/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim +++ b/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim @@ -41,14 +41,20 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T = # ln1p(x) does ln(1 + x) but avoids catastrophic cancellation if x << 1. 
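The logsumexp trick referenced in the comment above is the identity ln(1 + exp(x)) = max(x, 0) + ln(1 + exp(-|x|)), which keeps exp() away from large arguments. A scalar sketch of the per-element loss (the library's `ln1p` binds C's log1p; plain `ln(1 + _)` is used here only to stay self-contained):

import std/math

proc sigmoidXentElem(x, t: float32): float32 =
  # -t*x + ln(1 + exp(x)), rearranged so exp() never receives a large argument
  -t * x + max(x, 0.0'f32) + ln(1.0'f32 + exp(-abs(x)))

echo sigmoidXentElem(30.0'f32, 1.0'f32)   # ~0.0, and exp(30) is never evaluated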
- # result = 0.T - # for xi, ti in zip(input, target): - # result += (-ti * xi + max(xi,0) + ln1p(exp(-abs(xi))) ) / T(input.shape[1]) - - # We need parallel fused map2 -> reduce for all loss functions - result = sum: - map2_inline(input, target): - -y * x + max(x,0) + ln1p(exp(-abs(x))) # This leverage the logsumexp trick to improve numerical stability + result = 0.T + for xi, ti in zip(input, target): + result += (-ti * xi + max(xi,0) + ln1p(exp(-abs(xi))) ) / T(input.shape[1]) + + # TODO - Parallel fused map-reduce, openmp issue - https://github.com/mratsim/Arraymancer/issues/485 + # forEachStaged ii in input, ti in target: + # before_loop: + # var local_sum{.exportc.} = 0.T + # in_loop: + # # This leverage the logsumexp trick to improve numerical stability + # local_sum += -ti * ii + max(ii,0) + ln1p(exp(-abs(ii))) + # after_loop: + # {.emit: "#pragma omp atomic".} + # {.emit: "`result` += `local_sum`;".} # Normalize by batch_size result /= T(batch_size) diff --git a/src/arraymancer/nn_primitives/nnp_softmax.nim b/src/arraymancer/nn_primitives/nnp_softmax.nim index edb54b646..e5f9c1350 100644 --- a/src/arraymancer/nn_primitives/nnp_softmax.nim +++ b/src/arraymancer/nn_primitives/nnp_softmax.nim @@ -29,10 +29,11 @@ proc softmax*[T](input: Tensor[T]): Tensor[T] {.noInit.} = let batch_size = input.shape[0] result = zeros_like(input) + # TODO: the performance of nested parallel regions is highly suspect for i in 0||(batch_size-1): let (max, sumexp) = input[i, _].streaming_max_sumexp var res_slice = result[i, _] - apply2_inline(res_slice, input[i, _]): - stable_softmax(y, max, sumexp) + forEachSerial r in res_slice, inpi in input[i, _]: + r = stable_softmax(inpi, max, sumexp) diff --git a/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim b/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim index 1af1b45af..2b8e3d39f 100644 --- a/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim +++ b/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim @@ -146,13 +146,14 @@ proc softmax_cross_entropy_backward*[T]( result = zeros_like(cached_tensor) + # TODO: nested parallelism has suspect performance for i in 0||(batch_size-1): let (max, sumexp) = cached_tensor[i,_].streaming_max_sumexp var res_slice = result[i,_] - apply3_inline(res_slice, cached_tensor[i,_], target[i,_]): - grad * (stable_softmax(y, max, sumexp) - z) / T(batch_size) + forEachSerial r in res_slice, ci in cached_tensor[i,_], ti in target[i,_]: + r = grad * (stable_softmax(ci, max, sumexp) - ti) / T(batch_size) proc sparse_softmax_cross_entropy_backward*[T; Idx: SomeNumber or byte or char or enum]( @@ -185,13 +186,14 @@ proc sparse_softmax_cross_entropy_backward*[T; Idx: SomeNumber or byte or char o for i, truth_idx in enumerate(target): result[i, int(truth_idx)] = -1 + # TODO: the performance of nested parallel regions is highly suspect for i in 0||(batch_size-1): let (max, sumexp) = cached_tensor[i, _].streaming_max_sumexp var res_slice = result[i, _] - apply2_inline(res_slice, cached_tensor[i, _]): - grad * (stable_softmax(y, max, sumexp) + x) / T(batch_size) + forEachSerial r in res_slice, ci in cached_tensor[i, _]: + r = grad * (stable_softmax(ci, max, sumexp) + r) / T(batch_size) # ################################################ diff --git a/src/arraymancer/stats/kde.nim b/src/arraymancer/stats/kde.nim index 4746ade56..c5f468559 100644 --- a/src/arraymancer/stats/kde.nim +++ b/src/arraymancer/stats/kde.nim @@ -179,7 +179,8 @@ proc kde*[T: SomeNumber; U: int | Tensor[SomeNumber] | 
openArray[SomeNumber]]( if normalize: let normFactor = 1.0 / (result.sum * (maxT - minT) / nsamples.float) - result.apply_inline(normFactor * x) + forEach x in result: + x *= normFactor proc kde*[T: SomeNumber; U: KernelKind | string; V: int | Tensor[SomeNumber] | openArray[SomeNumber]]( t: Tensor[T], diff --git a/src/arraymancer/tensor.nim b/src/arraymancer/tensor.nim index 7cdf33708..f106e584e 100644 --- a/src/arraymancer/tensor.nim +++ b/src/arraymancer/tensor.nim @@ -17,6 +17,7 @@ import nimblas export OrderType import ./laser/dynamic_stack_arrays, + ./laser/strided_iteration/foreach, ./tensor/data_structure, ./tensor/init_cpu, ./tensor/init_copy_cpu, @@ -45,6 +46,7 @@ import ./laser/dynamic_stack_arrays, ./tensor/exporting export dynamic_stack_arrays, + foreach, data_structure, init_cpu, init_copy_cpu, diff --git a/src/arraymancer/tensor/higher_order_applymap.nim b/src/arraymancer/tensor/higher_order_applymap.nim index 061ce6dcd..e4c81bd17 100644 --- a/src/arraymancer/tensor/higher_order_applymap.nim +++ b/src/arraymancer/tensor/higher_order_applymap.nim @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import ./backend/openmp, +import ../laser/strided_iteration/[foreach, foreach_staged], + ./backend/openmp, ./private/p_checks, ./data_structure, ./init_cpu, ./accessors, ./accessors_macros_write +export foreach, foreach_staged import sugar except enumerate # #################################################################### @@ -206,10 +208,8 @@ proc apply*[T: KnownSupportsCopyMem](t: var Tensor[T], f: proc(x:var T)) = ## x += 1 ## a.apply(pluseqone) # Apply the in-place function pluseqone ## ``apply`` is especially useful to do multiple element-wise operations on a tensor in a single loop over the data. 
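The rewrite just below replaces the chunked OpenMP/`mitems` loop with a single fused `forEach` pass over the tensor. As a standalone sketch of what that pass does (values are illustrative):

import arraymancer

var t = [[1.0, 2.0], [3.0, 4.0]].toTensor

# one strided, in-place traversal - the body plays the role of `f(x)`
forEach x in t:
  x += 1.0

doAssert t == [[2.0, 3.0], [4.0, 5.0]].toTensor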
- - omp_parallel_blocks(block_offset, block_size, t.size): - for x in t.mitems(block_offset, block_size): - f(x) + forEach x in t: + f(x) proc map2*[T, U; V: KnownSupportsCopyMem](t1: Tensor[T], f: (T,U) -> V, diff --git a/src/arraymancer/tensor/lapack.nim b/src/arraymancer/tensor/lapack.nim index 1414510d8..30acc564e 100644 --- a/src/arraymancer/tensor/lapack.nim +++ b/src/arraymancer/tensor/lapack.nim @@ -17,6 +17,16 @@ import ./data_structure, ./higher_order_applymap proc frobenius_inner_prod*[T](a,b: Tensor[T]): T = - sum: - map2_inline(a,b): - x * y + # sum: + # map2_inline(a,b): + # x * y + forEachStaged ai in a, bi in b: + openmp_config: + use_simd: true + before_loop: + var local_sum{.exportc.} = 0.T + in_loop: + local_sum += ai * bi + after_loop: + {.emit: "#pragma omp atomic".} + {.emit: "`result` += `local_sum`;".} diff --git a/src/arraymancer/tensor/math_functions.nim b/src/arraymancer/tensor/math_functions.nim index 26472fdb1..8169afd89 100644 --- a/src/arraymancer/tensor/math_functions.nim +++ b/src/arraymancer/tensor/math_functions.nim @@ -26,7 +26,9 @@ proc elwise_mul*[T](a, b: Tensor[T]): Tensor[T] {.noInit.} = proc melwise_mul*[T](a: var Tensor[T], b: Tensor[T]) = ## Element-wise multiply - a.apply2_inline(b, x * y) + forEach x in a, + y in b: + x *= y proc elwise_div*[T: Someinteger](a, b: Tensor[T]): Tensor[T] {.noInit.} = ## Element-wise division @@ -38,11 +40,15 @@ proc elwise_div*[T: SomeFloat](a, b: Tensor[T]): Tensor[T] {.noInit.} = proc melwise_div*[T: Someinteger](a: var Tensor[T], b: Tensor[T]) = ## Element-wise division (in-place) - a.apply2_inline(b, x div y) + forEach x in a, + y in b: + x = x div y proc melwise_div*[T: SomeFloat](a: var Tensor[T], b: Tensor[T]) = ## Element-wise division (in-place) - a.apply2_inline(b, x / y) + forEach x in a, + y in b: + x /= y proc reciprocal*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} = ## Return a tensor with the reciprocal 1/x of all elements @@ -50,17 +56,21 @@ proc reciprocal*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} = proc mreciprocal*[T: SomeFloat](t: var Tensor[T]) = ## Apply the reciprocal 1/x in-place to all elements of the Tensor - t.apply_inline(1.T/x) + forEach x in t: + x = 1.T/x proc reciprocal*[T: Complex[float32] or Complex[float64]](t: Tensor[T]): Tensor[T] {.noInit.} = ## Return a tensor with the reciprocal 1/x of all elements type F = T.T # Get float subtype of Complex[T] - t.map_inline(complex(1.F, 0.F)/x) + result = newTensorUninit[T](t.shape) + forEach x in t, r in result: + r = complex(1.F, 0.F)/x proc mreciprocal*[T: Complex[float32] or Complex[float64]](t: var Tensor[T]) = ## Apply the reciprocal 1/x in-place to all elements of the Tensor type F = T.T # Get float subtype of Complex[T] - t.apply_inline(complex(1.F, 0.F)/x) + forEach x in t: + x = complex(1.F, 0.F)/x proc negate*[T: SomeSignedInt|SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} = ## Return a tensor with all elements negated (10 -> -10) @@ -68,7 +78,8 @@ proc negate*[T: SomeSignedInt|SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} = proc mnegate*[T: SomeSignedInt|SomeFloat](t: var Tensor[T]) = ## Negate in-place all elements of the tensor (10 -> -10) - t.apply_inline(-x) + forEach x in t: + x = -x proc `-`*[T: SomeNumber](t: Tensor[T]): Tensor[T] {.noInit.} = ## Negate all values of a Tensor @@ -91,13 +102,15 @@ proc abs*(t: Tensor[Complex[float32]]): Tensor[float32] {.noInit.} = proc mabs*[T](t: var Tensor[T]) = ## Return a Tensor with absolute values of all elements # FIXME: how to inplace convert Tensor[Complex] to 
Tensor[float] - t.apply_inline(abs(x)) + forEach x in t: + x = abs(x) proc clamp*[T](t: Tensor[T], min, max: T): Tensor[T] {.noInit.} = t.map_inline(clamp(x, min, max)) proc mclamp*[T](t: var Tensor[T], min, max: T) = - t.apply_inline(clamp(x, min, max)) + forEach x in t: + x = clamp(x, min, max) proc square*[T](x: T): T {.inline.} = ## Return x*x diff --git a/src/arraymancer/tensor/operators_blas_l1.nim b/src/arraymancer/tensor/operators_blas_l1.nim index 2dd676399..886653799 100644 --- a/src/arraymancer/tensor/operators_blas_l1.nim +++ b/src/arraymancer/tensor/operators_blas_l1.nim @@ -90,16 +90,19 @@ proc `*=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], a: ## Element-wise multiplication by a scalar (in-place) if t.size == 0: return - t.apply_inline(x * a) + forEach x in t: + x *= a proc `/=`*[T: SomeFloat|Complex[float32]|Complex[float64]](t: var Tensor[T], a: T) = ## Element-wise division by a scalar (in-place) if t.size == 0: return - t.apply_inline(x / a) + forEach x in t: + x /= a proc `/=`*[T: SomeInteger](t: var Tensor[T], a: T) = ## Element-wise division by a scalar (in-place) if t.size == 0: return - t.apply_inline(x div a) + forEach x in t: + x = x div a diff --git a/src/arraymancer/tensor/operators_broadcasted.nim b/src/arraymancer/tensor/operators_broadcasted.nim index 5918c914d..c7f9dfa7a 100644 --- a/src/arraymancer/tensor/operators_broadcasted.nim +++ b/src/arraymancer/tensor/operators_broadcasted.nim @@ -145,31 +145,36 @@ proc `+.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], v ## Tensor in-place addition with a broadcasted scalar. if t.size == 0: return - t.apply_inline(x + val) + forEach x in t: + x += val proc `-.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], val: T) = ## Tensor in-place substraction with a broadcasted scalar. if t.size == 0: return - t.apply_inline(x - val) + forEach x in t: + x -= val proc `^.=`*[T: SomeFloat|Complex[float32]|Complex[float64]](t: var Tensor[T], exponent: T) = ## Compute in-place element-wise exponentiation if t.size == 0: return - t.apply_inline pow(x, exponent) + forEach x in t: + x = x.pow(exponent) proc `*.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], val: T) = ## Tensor in-place multiplication with a broadcasted scalar. if t.size == 0: return - t.apply_inline(x * val) + forEach x in t: + x *= val proc `/.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], val: T) = ## Tensor in-place division with a broadcasted scalar. if t.size == 0: return - t.apply_inline(x / val) + forEach x in t: + x /= val # ############################################## diff --git a/tests/manual_checks/optimizers/test_optimizers.nim b/tests/manual_checks/optimizers/test_optimizers.nim index d8dd643ac..8ef0b82fb 100644 --- a/tests/manual_checks/optimizers/test_optimizers.nim +++ b/tests/manual_checks/optimizers/test_optimizers.nim @@ -13,7 +13,7 @@ # limitations under the License. 
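The `forEachStaged` construct used for `frobenius_inner_prod` earlier in this diff (and referenced by the sigmoid cross-entropy TODO) splits a parallel loop into per-thread setup, body, and merge stages. A minimal reduction sketch in the same style, assuming the same OpenMP emit pattern as the diff:

import arraymancer

proc dotProduct[T](a, b: Tensor[T]): T =
  forEachStaged ai in a, bi in b:
    openmp_config:
      use_simd: true
    before_loop:
      var local_sum{.exportc.} = 0.T    # one accumulator per OpenMP thread
    in_loop:
      local_sum += ai * bi
    after_loop:
      {.emit: "#pragma omp atomic".}
      {.emit: "`result` += `local_sum`;".}

let u = [1.0, 2.0, 3.0].toTensor
let v = [4.0, 5.0, 6.0].toTensor
echo dotProduct(u, v)   # 32.0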
import - ../../src/arraymancer, ../testutils, + ../../../src/arraymancer, ../../testutils, unittest, random, strformat # ############################################################ @@ -25,14 +25,14 @@ import # We use the Rosenbrock function for testing optimizers # https://en.wikipedia.org/wiki/Rosenbrock_function -func rosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): Tensor[T] = +proc rosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): Tensor[T] = # f(x, y) = (a - x)² + b(y - x²)² result = map2_inline(x, y): let u = a - x let v = y - x*x u * u + b * v * v -func drosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): tuple[dx, dy: Tensor[T]] = +proc drosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): tuple[dx, dy: Tensor[T]] = result.dx = map2_inline(x, y): 2.T*x - 2.T*a + 4*b*x*x*x - 4*b*x*y result.dy = map2_inline(x, y):