From 6d32073977fb94c4d4fd043f89836b47d43b21f5 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:55:18 +0200 Subject: [PATCH 01/37] Add files via upload --- .../NNlibLoopVectorizationExt.jl | 10 + ext/NNlibLoopVectorizationExt/conv.jl | 181 +++++++++++ ext/NNlibLoopVectorizationExt/conv_old.jl | 304 ++++++++++++++++++ ext/NNlibLoopVectorizationExt/pooling.jl | 109 +++++++ ext/NNlibLoopVectorizationExt/pooling_old.jl | 201 ++++++++++++ 5 files changed, 805 insertions(+) create mode 100644 ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl create mode 100644 ext/NNlibLoopVectorizationExt/conv.jl create mode 100644 ext/NNlibLoopVectorizationExt/conv_old.jl create mode 100644 ext/NNlibLoopVectorizationExt/pooling.jl create mode 100644 ext/NNlibLoopVectorizationExt/pooling_old.jl diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl new file mode 100644 index 000000000..d2053161c --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl @@ -0,0 +1,10 @@ +module NNlibLoopVectorizationExt + +using NNlib +using LoopVectorization +using Random, Statistics + +include("conv.jl") +include("pooling.jl") + +end # module \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl new file mode 100644 index 000000000..3aafe9246 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -0,0 +1,181 @@ +#= +Accelerated convolution for 2d-images using the power of LoopVectorization. +The acceleration is usually greatest when the inputs have a large spatial size and few channels. +Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. + +Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation copied from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl +=# + +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + # padding is done naively at the moment + if cdims.padding != (0, 0, 0, 0) + input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2)) + end + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batches = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("forward: very specialized case for maximum performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 # second specialized case for better performance + # println("forward: second specialized case for better performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else # general case for any convolution + # println("forward: general case for any convolution") + + @tturbo for index_batch in 1:batches + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + + return output +end + +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded (padding is done naively at the moment) + if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) + else + input_gradient_padded = input_gradient + end + + # store the size of input after padding + input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + # actual computation (using @tturbo instead of Threads.@threads + @turbo may end up in wrong results) + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("backward: very specialized case for maximum performance") + + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + elseif groups == 1 # second specialized case for better performance + # println("backward: second specialized case for better performance") + + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + else # general case for any convolution + # println("backward: general case for any convolution") + + Threads.@threads for index_batch in 1:current_batch_size + for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results + @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + 
for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + + end + + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] + end + + return input_gradient +end \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/conv_old.jl b/ext/NNlibLoopVectorizationExt/conv_old.jl new file mode 100644 index 000000000..18cc294b6 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/conv_old.jl @@ -0,0 +1,304 @@ +#= +Implementation copied from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl +Could include bias & activation too, hence overload `conv_bias_act`, +at the cost of needing gradient rules for that. +=# + +function zero_pad_2d(input::AbstractArray{T, 4}, padding::NTuple{4, Int}) where {T <: Number} + width, height, channels, current_batch_size = size(input) + x_pad1, x_pad2, y_pad1, y_pad2 = padding + output_height, output_width = height + y_pad1 + y_pad2, width + x_pad1 + x_pad2 + output = zeros(T, output_width, output_height, channels, current_batch_size) + output[x_pad1 + 1:output_width - x_pad2, y_pad1 + 1:output_height - y_pad2, :, :] = input + + return output +end + +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + # println("myconv called") + + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + if cdims.padding != (0, 0, 0, 0) + #= + invoke(NNlib.conv!, + Tuple{AbstractArray{T,4},AbstractArray{T,4},AbstractArray{T,4},ConvDims}, + output, input, weight, cdims; kw...) 
+ =# + input = zero_pad_2d(input, cdims.padding) + end + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batches = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) + # println("very specialized case for maximum performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] + end + end + + elseif groups == 1 + # println("second specialized case for better performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] + end + end + + else + # println("general case for any convolution") + + @tturbo for index_batch in 1:batches + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] + end + end + end + + return output +end + +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + # println("myconv data back called") + + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded + if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = zero_pad_2d(input_gradient, cdims.padding) + else + input_gradient_padded = input_gradient + end + # store the size of input after padding + input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding + + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + # actual computation + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("very specialized case for maximum performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + elseif groups == 1 # second specialized case for better performance + # println("second specialized case for better performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + else # general case for any convolution + # println("general case for any convolution") + Threads.@threads for index_batch in 1:current_batch_size + for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results + @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += 
weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + end + + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] + end + + return input_gradient +end + +#= +function NNlib.∇conv_filter!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + # println("myconv filter back called") + + size_weight_check_dims = (size(weight_gradient)[1:2]..., size(weight_gradient)[3]*cdims.groupcount, size(weight_gradient)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + input_width, input_height, in_channels, current_batch_size = size(input) + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight_gradient) + + # check if input must be padded + if cdims.padding != (0, 0, 0, 0) + input_padded = zero_pad_2d(input, cdims.padding) + else + input_padded = input + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + # actual computation + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("very specialized case for maximum performance") + #= + @tturbo for out_channel in 1:out_channels + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value = zero(T) + for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width + value += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + weight_gradient[x_w, y_w, in_channel, out_channel] = value + end + end + =# + weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) + @tturbo for index_batch in 1:current_batch_size # Threads.@threads + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) + weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) + elseif groups == 1 # second specialized case for better performance + # println("second specialized case for better performance") + #= + @tturbo for out_channel in 1:out_channels + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value = zero(T) + for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + value += 
input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + weight_gradient[x_w, y_w, in_channel, out_channel] = value + end + end + =# + weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) + @tturbo for index_batch in 1:current_batch_size # Threads.@threads + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) + weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) + else # general case for any convolution + # println("general case for any convolution") + #= + @tturbo for out_channel_per_group in 1:out_channels_per_group + for group in 1:groups, in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + value = zero(T) + for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + weight_gradient[x_w, y_w, in_channel_weight, out_channel] = value + end + end + =# + weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) + @tturbo for index_batch in 1:current_batch_size # Threads.@threads + for out_channel_per_group in 1:out_channels_per_group # @turbo + for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + weight_gradient_batched[x_w, y_w, in_channel_weight, out_channel, index_batch] += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) + weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) + end + + if !NNlib.flipkernel(cdims) + weight_gradient = reverse(weight_gradient, dims=(1, 2)) + end + + return weight_gradient +end +=# \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl new file mode 100644 index 000000000..dc948335b --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/pooling.jl @@ -0,0 +1,109 @@ +#= +Accelerated mean pooling for 2d-images using the power 
of LoopVectorization. +The speed up is usually lower compared to conv but can be approximately up to 2x. + +Since the current state of LoopVectorization ∇meanpool! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation inspired from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl +=# + +function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} + NNlib.check_dims(size(input), size(output), pdims) + + # storing all the necessary shapes + input_width, input_height, channels, current_batch_size = size(input) + output_width, output_height, channels, current_batch_size = size(output) + kernel_width, kernel_height = pdims.kernel_size + + x_stride, y_stride = pdims.stride + x_dilation, y_dilation = pdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding + + # A helper function to project from output (w, h) to input (input_w, input_h) + @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) + + # We represent division by kernel size by rolling it + # into the `alpha` multiplier. + _alpha = T(1 / prod(pdims.kernel_size)) + + # Start with the central region + w_region, h_region, _ = central_region + + if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance + # println("specialized case for better performance") + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] + kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + else # general case for any meanpooling + # println("general case for any meanpooling") + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + y_in = m + (y_w - 1) * y_dilation # - y_pad1 + x_in = n + (x_w - 1) * x_dilation # - x_pad1 + kernel_sum += input[x_in, y_in, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + end + + # Next, the padded regions + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:current_batch_size, channel in 1:channels + for d in d_region # for skipping the d_regions + for h in h_region + ph = project(h, y_stride, y_pad1) + for w in w_region + pw = project(w, x_stride, x_pad1) + m = zero(T) + + for kh in 1:kernel_height + input_kh = ph + (kh - 1) * y_dilation + if input_kh <= 0 || input_kh > 
input_height + continue + end + + for kw in 1:kernel_width + input_kw = pw + (kw - 1) * x_dilation + if input_kw <= 0 || input_kw > input_width + continue + end + + m += input[input_kw, input_kh, channel, index_batch] + end + end + + output[w, h, channel, index_batch] = _alpha * m + end + end + end + end + end + + return output +end \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/pooling_old.jl b/ext/NNlibLoopVectorizationExt/pooling_old.jl new file mode 100644 index 000000000..dc8fe34d5 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/pooling_old.jl @@ -0,0 +1,201 @@ +#= +Accelerated mean pooling for 2d-images using the power of LoopVectorization. +The speed up is lower compared to conv but can be approximately up to 2x. + +Since the current state of LoopVectorization ∇meanpool! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation inspired from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl +=# + +function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} + # storing all the necessary shapes + input_width, input_height, channels, current_batch_size = size(input) + output_width, output_height, channels, current_batch_size = size(output) + kernel_width, kernel_height = pdims.kernel_size + + x_stride, y_stride = pdims.stride + x_dilation, y_dilation = pdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding + + # A helper function to project from output (w, h) to input (input_w, input_h) + @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) + + # We represent division by kernel size by rolling it + # into the `alpha` multiplier. 
+ _alpha = T(1 / prod(pdims.kernel_size)) + + # Start with the central region + w_region, h_region, _ = central_region + + if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] + kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + else # general case for any meanpooling + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + y_in = m + (y_w - 1) * y_dilation # - y_pad1 + x_in = n + (x_w - 1) * x_dilation # - x_pad1 + kernel_sum += input[x_in, y_in, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + end + + # Next, the padded regions + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:current_batch_size, channel in 1:channels + for d in d_region # for skipping the d_regions + for h in h_region + ph = project(h, y_stride, y_pad1) + for w in w_region + pw = project(w, x_stride, x_pad1) + m = zero(T) + + for kh in 1:kernel_height + input_kh = ph + (kh - 1) * y_dilation + if input_kh <= 0 || input_kh > input_height + continue + end + + for kw in 1:kernel_width + input_kw = pw + (kw - 1) * x_dilation + if input_kw <= 0 || input_kw > input_width + continue + end + + m += input[input_kw, input_kh, channel, index_batch] + end + end + + output[w, h, channel, index_batch] = _alpha * m + end + end + end + end + end + + return output +end + +function NNlib.∇meanpool!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} + # storing all the necessary shapes + input_width, input_height, channels, current_batch_size = size(input_gradient) + output_width, output_height, channels, current_batch_size = size(output_gradient) + kernel_width, kernel_height = pdims.kernel_size + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + + x_stride, y_stride = pdims.stride + x_dilation, y_dilation = pdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding + + # A helper function to project from output (w, h) to input (input_w, input_h) + @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) + + # We represent division by kernel size by rolling it + # into the `alpha` multiplier. 
+ _alpha = T(1 / prod(pdims.kernel_size)) + + # Start with the central region + w_region, h_region, _ = central_region + + if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] + for y_w in 1:kernel_height, x_w in 1:kernel_width + input_gradient[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] += output_gradient_value * _alpha + end + end + end + + else # general case for any meanpooling + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 + for y_w in 1:kernel_height, x_w in 1:kernel_width + y_in = m + (y_w - 1) * y_dilation # - y_pad1 + x_in = n + (x_w - 1) * x_dilation # - x_pad1 + input_gradient[x_in, y_in, channel, index_batch] += output_gradient_value * _alpha + end + end + end + + end + + # Next, the padded regions + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:current_batch_size, channel in 1:channels + for d in d_region # for skipping the d_regions + for h in h_region + ph = project(h, y_stride, y_pad1) + for w in w_region + pw = project(w, x_stride, x_pad1) + + # Grab the incoming gradient at this index for future use + output_gradient_value = output_gradient[w, h, channel, index_batch] + + for kh in 1:kernel_height + input_kh = ph + (kh - 1) * y_dilation + if input_kh <= 0 || input_kh > input_height + continue + end + + for kw in 1:kernel_width + input_kw = pw + (kw - 1) * x_dilation + if input_kw <= 0 || input_kw > input_width + continue + end + + input_gradient[input_kw, input_kh, channel, index_batch] += output_gradient_value * _alpha + end + end + + end + end + end + end + end + + return input_gradient +end \ No newline at end of file From 2a3cf3eecf892c0fba0904567d678813f1587766 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:55:50 +0200 Subject: [PATCH 02/37] Delete ext/NNlibLoopVectorizationExt/conv_old.jl --- ext/NNlibLoopVectorizationExt/conv_old.jl | 304 ---------------------- 1 file changed, 304 deletions(-) delete mode 100644 ext/NNlibLoopVectorizationExt/conv_old.jl diff --git a/ext/NNlibLoopVectorizationExt/conv_old.jl b/ext/NNlibLoopVectorizationExt/conv_old.jl deleted file mode 100644 index 18cc294b6..000000000 --- a/ext/NNlibLoopVectorizationExt/conv_old.jl +++ /dev/null @@ -1,304 +0,0 @@ -#= -Implementation copied from here (Jonas Steinebach, MIT): -https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl -Could include bias & activation too, hence overload `conv_bias_act`, -at the cost of needing gradient rules for that. 
-=# - -function zero_pad_2d(input::AbstractArray{T, 4}, padding::NTuple{4, Int}) where {T <: Number} - width, height, channels, current_batch_size = size(input) - x_pad1, x_pad2, y_pad1, y_pad2 = padding - output_height, output_width = height + y_pad1 + y_pad2, width + x_pad1 + x_pad2 - output = zeros(T, output_width, output_height, channels, current_batch_size) - output[x_pad1 + 1:output_width - x_pad2, y_pad1 + 1:output_height - y_pad2, :, :] = input - - return output -end - -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # println("myconv called") - - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) - - if cdims.padding != (0, 0, 0, 0) - #= - invoke(NNlib.conv!, - Tuple{AbstractArray{T,4},AbstractArray{T,4},AbstractArray{T,4},ConvDims}, - output, input, weight, cdims; kw...) - =# - input = zero_pad_2d(input, cdims.padding) - end - - output_width, output_height, _ = size(output) - input_width, input_height, in_channels, batches = size(input) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - - if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) - # println("very specialized case for maximum performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] - end - end - - elseif groups == 1 - # println("second specialized case for better performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] - end - end - - else - # println("general case for any convolution") - - @tturbo for index_batch in 1:batches - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * 
in_channels_weight - value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] - end - end - end - - return output -end - -function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # println("myconv data back called") - - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - # check if input_gradient must be padded - if cdims.padding != (0, 0, 0, 0) - input_gradient_padded = zero_pad_2d(input_gradient, cdims.padding) - else - input_gradient_padded = input_gradient - end - # store the size of input after padding - input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding - - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - # actual computation - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("very specialized case for maximum performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - elseif groups == 1 # second specialized case for better performance - # println("second specialized case for better performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - else # general case for any convolution - # println("general case for any convolution") - Threads.@threads for index_batch in 1:current_batch_size - for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results - @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - 
out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - end - - # depad - if cdims.padding != (0, 0, 0, 0) - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] - end - - return input_gradient -end - -#= -function NNlib.∇conv_filter!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # println("myconv filter back called") - - size_weight_check_dims = (size(weight_gradient)[1:2]..., size(weight_gradient)[3]*cdims.groupcount, size(weight_gradient)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - input_width, input_height, in_channels, current_batch_size = size(input) - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight_gradient) - - # check if input must be padded - if cdims.padding != (0, 0, 0, 0) - input_padded = zero_pad_2d(input, cdims.padding) - else - input_padded = input - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - # actual computation - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("very specialized case for maximum performance") - #= - @tturbo for out_channel in 1:out_channels - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value = zero(T) - for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width - value += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - weight_gradient[x_w, y_w, in_channel, out_channel] = value - end - end - =# - weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) - @tturbo for index_batch in 1:current_batch_size # Threads.@threads - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) - weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) - elseif groups == 1 # second specialized case for better performance - # println("second specialized case for better performance") - #= - @tturbo for out_channel in 
1:out_channels - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value = zero(T) - for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - value += input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - weight_gradient[x_w, y_w, in_channel, out_channel] = value - end - end - =# - weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) - @tturbo for index_batch in 1:current_batch_size # Threads.@threads - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) - weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) - else # general case for any convolution - # println("general case for any convolution") - #= - @tturbo for out_channel_per_group in 1:out_channels_per_group - for group in 1:groups, in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - value = zero(T) - for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - weight_gradient[x_w, y_w, in_channel_weight, out_channel] = value - end - end - =# - weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) - @tturbo for index_batch in 1:current_batch_size # Threads.@threads - for out_channel_per_group in 1:out_channels_per_group # @turbo - for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - weight_gradient_batched[x_w, y_w, in_channel_weight, out_channel, index_batch] += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) - weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) - end - - if !NNlib.flipkernel(cdims) - weight_gradient = 
reverse(weight_gradient, dims=(1, 2)) - end - - return weight_gradient -end -=# \ No newline at end of file From 28a027a703b649532c641927ca085aa3c76dbca9 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:56:01 +0200 Subject: [PATCH 03/37] Delete ext/NNlibLoopVectorizationExt/pooling_old.jl --- ext/NNlibLoopVectorizationExt/pooling_old.jl | 201 ------------------- 1 file changed, 201 deletions(-) delete mode 100644 ext/NNlibLoopVectorizationExt/pooling_old.jl diff --git a/ext/NNlibLoopVectorizationExt/pooling_old.jl b/ext/NNlibLoopVectorizationExt/pooling_old.jl deleted file mode 100644 index dc8fe34d5..000000000 --- a/ext/NNlibLoopVectorizationExt/pooling_old.jl +++ /dev/null @@ -1,201 +0,0 @@ -#= -Accelerated mean pooling for 2d-images using the power of LoopVectorization. -The speed up is lower compared to conv but can be approximately up to 2x. - -Since the current state of LoopVectorization ∇meanpool! isn't really faster than the -original implementation in some situations, it is left out for the moment. - -Implementation inspired from here (Jonas Steinebach, MIT): -https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl -=# - -function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} - # storing all the necessary shapes - input_width, input_height, channels, current_batch_size = size(input) - output_width, output_height, channels, current_batch_size = size(output) - kernel_width, kernel_height = pdims.kernel_size - - x_stride, y_stride = pdims.stride - x_dilation, y_dilation = pdims.dilation - x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding - - # A helper function to project from output (w, h) to input (input_w, input_h) - @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 - - # We use calc_padding_regions to split outselves up into separate regions that may or - # may not need to worry about padding: - pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) - # println(pdims_3d.padding) - padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) - - # We represent division by kernel size by rolling it - # into the `alpha` multiplier. 
- _alpha = T(1 / prod(pdims.kernel_size)) - - # Start with the central region - w_region, h_region, _ = central_region - - if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - kernel_sum = zero(T) - for y_w in 1:kernel_height, x_w in 1:kernel_width - # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] - kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] - end - output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha - end - end - - else # general case for any meanpooling - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - kernel_sum = zero(T) - for y_w in 1:kernel_height, x_w in 1:kernel_width - y_in = m + (y_w - 1) * y_dilation # - y_pad1 - x_in = n + (x_w - 1) * x_dilation # - x_pad1 - kernel_sum += input[x_in, y_in, channel, index_batch] - end - output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha - end - end - - end - - # Next, the padded regions - @inbounds for (w_region, h_region, d_region) in padded_regions - for index_batch in 1:current_batch_size, channel in 1:channels - for d in d_region # for skipping the d_regions - for h in h_region - ph = project(h, y_stride, y_pad1) - for w in w_region - pw = project(w, x_stride, x_pad1) - m = zero(T) - - for kh in 1:kernel_height - input_kh = ph + (kh - 1) * y_dilation - if input_kh <= 0 || input_kh > input_height - continue - end - - for kw in 1:kernel_width - input_kw = pw + (kw - 1) * x_dilation - if input_kw <= 0 || input_kw > input_width - continue - end - - m += input[input_kw, input_kh, channel, index_batch] - end - end - - output[w, h, channel, index_batch] = _alpha * m - end - end - end - end - end - - return output -end - -function NNlib.∇meanpool!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} - # storing all the necessary shapes - input_width, input_height, channels, current_batch_size = size(input_gradient) - output_width, output_height, channels, current_batch_size = size(output_gradient) - kernel_width, kernel_height = pdims.kernel_size - - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - - x_stride, y_stride = pdims.stride - x_dilation, y_dilation = pdims.dilation - x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding - - # A helper function to project from output (w, h) to input (input_w, input_h) - @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 - - # We use calc_padding_regions to split outselves up into separate regions that may or - # may not need to worry about padding: - pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) - # println(pdims_3d.padding) - padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) - - # We represent division by kernel size by rolling it - # into the `alpha` multiplier. 
- _alpha = T(1 / prod(pdims.kernel_size)) - - # Start with the central region - w_region, h_region, _ = central_region - - if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] - for y_w in 1:kernel_height, x_w in 1:kernel_width - input_gradient[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] += output_gradient_value * _alpha - end - end - end - - else # general case for any meanpooling - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - for y_w in 1:kernel_height, x_w in 1:kernel_width - y_in = m + (y_w - 1) * y_dilation # - y_pad1 - x_in = n + (x_w - 1) * x_dilation # - x_pad1 - input_gradient[x_in, y_in, channel, index_batch] += output_gradient_value * _alpha - end - end - end - - end - - # Next, the padded regions - @inbounds for (w_region, h_region, d_region) in padded_regions - for index_batch in 1:current_batch_size, channel in 1:channels - for d in d_region # for skipping the d_regions - for h in h_region - ph = project(h, y_stride, y_pad1) - for w in w_region - pw = project(w, x_stride, x_pad1) - - # Grab the incoming gradient at this index for future use - output_gradient_value = output_gradient[w, h, channel, index_batch] - - for kh in 1:kernel_height - input_kh = ph + (kh - 1) * y_dilation - if input_kh <= 0 || input_kh > input_height - continue - end - - for kw in 1:kernel_width - input_kw = pw + (kw - 1) * x_dilation - if input_kw <= 0 || input_kw > input_width - continue - end - - input_gradient[input_kw, input_kh, channel, index_batch] += output_gradient_value * _alpha - end - end - - end - end - end - end - end - - return input_gradient -end \ No newline at end of file From 5339aafa8ed56494c50a83a7767e95e7f7be79a7 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:57:34 +0200 Subject: [PATCH 04/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 71 ++++++++++++++++++++++++++ test/runtests.jl | 17 +++++- 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 test/ext_loopvectorization/runtests.jl diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl new file mode 100644 index 000000000..05157a8bc --- /dev/null +++ b/test/ext_loopvectorization/runtests.jl @@ -0,0 +1,71 @@ +using NNlib, Test + +function compute_conv_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}) where {T<:Real} + conv_outs = Vector{Array{T, 4}}(undef, length(settings)) + conv_grads = Vector{Array{T, 4}}(undef, length(settings)) + + for (i, setting) in enumerate(settings) + if setting.groups > 1 + weight = weight_grouped + else + weight = weight_ungrouped + end + + cdims = NNlib.DenseConvDims(size(input), size(weight), stride = setting.stride, padding = setting.padding, dilation = setting.dilation, groups = setting.groups) + + out = NNlib.conv(input, weight, cdims) + output_gradient = ones(T, size(out)) + + 
conv_grads[i] = NNlib.∇conv_data(output_gradient, weight, cdims) + conv_outs[i] = out + end + + return conv_outs, conv_grads +end + +function compute_pool_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}) where {T<:Real} + pool_outs = Vector{Array{T, 4}}(undef, length(settings)) + + for (i, setting) in enumerate(settings) + pdims = NNlib.PoolDims(size(input), setting.kernel_size, stride = setting.stride, padding = setting.padding, dilation = setting.dilation) + pool_outs[i] = NNlib.meanpool(input, pdims) + end + + return pool_outs +end + +@testset "Convolution & Pooling" begin + + dtype = Float32 + input = rand(dtype, 224, 224, 3, 64) # for conv & pool + weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv + weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv + + conv_settings_list = [ + (; stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' + (; stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=1), # test 'second specialized case' + (; stride=(2, 1), padding=(2, 0), dilation=(2, 0), groups=3), # test 'general case' + ] + + pool_settings_list = [ + (; kernel_size=(5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' + (; kernel_size=(5, 4), stride=(2, 1), padding=(2, 0), dilation=(2, 1)), # test 'general case' + ] + + # compute outputs before loading LoopVectorization + + conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + pool_outs_std = compute_pool_outputs(pool_settings_list, input) + + using LoopVectorization # now load the NNlibLoopVectorizationExt + + conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + pool_outs_lv = compute_pool_outputs(pool_settings_list, input) + + # validate conv + @test all(isapprox.(conv_outs_std, conv_outs_lv)) + @test all(isapprox.(conv_grads_std, conv_grads_lv)) + # validate pool + @test all(isapprox.(pool_outs_std, pool_outs_lv)) + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 03602a40d..5266b5f44 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,8 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests -# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests +# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests +ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests const rng = StableRNG(123) include("test_utils.jl") @@ -155,4 +156,18 @@ end else @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them." end + + if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" + @testset "CPU" begin + # Don't import LoopVectorization here! + # Because the LV-impls are simply tested against NNlib's standard CPU impls, + # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! + @testset "LoopVectorization" begin + include("ext_loopvectorization/runtests.jl") + end + end + else + @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." 
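A sanity check that is not part of this patch but could sit next to the comparison above: the std-vs-LV test only means something if the extension really replaced NNlib's CPU paths, and the Julia >= 1.9 extension API can confirm that (the extension module name is the one declared in this PR's Project.toml).

using NNlib, LoopVectorization, Test
# Base.get_extension returns the extension module once both packages are loaded, `nothing` otherwise
ext = Base.get_extension(NNlib, :NNlibLoopVectorizationExt)
@test ext !== nothing   # from here on, NNlib.conv! / ∇conv_data! / meanpool! dispatch to the LV methods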
+ end + end From 9e0dc6d681c545cb3ff30b747aa384bcba29b691 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:57:47 +0200 Subject: [PATCH 05/37] Add files via upload --- Project.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index b33b90564..49ef5d07f 100644 --- a/Project.toml +++ b/Project.toml @@ -18,11 +18,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" [extensions] NNlibAMDGPUExt = "AMDGPU" NNlibCUDAExt = "CUDA" NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] +NNlibLoopVectorizationExt = "LoopVectorization" [compat] AMDGPU = "0.5, 0.6" @@ -35,6 +37,7 @@ GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" julia = "1.9" +# LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" @@ -44,6 +47,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -53,5 +57,5 @@ cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] test = ["AMDGPU", "CUDA", "ChainRulesTestUtils", "Documenter", - "FiniteDifferences", "ForwardDiff", "Logging", "ReverseDiff", - "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] + "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", + "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From dd0f0ed866a6d2ec12b865f4ccd7f0f1670d3db9 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:58:58 +0200 Subject: [PATCH 06/37] Add files via upload --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5266b5f44..2a44d8d3e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,7 +16,7 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests # ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests -ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests +# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests const rng = StableRNG(123) include("test_utils.jl") From b341d1cd81a968fccd5739fa40bf6c85ea330e82 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 23:16:41 +0200 Subject: [PATCH 07/37] Add files via upload --- runtests.jl | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 runtests.jl diff --git a/runtests.jl b/runtests.jl new file mode 100644 index 000000000..5266b5f44 --- /dev/null +++ b/runtests.jl @@ -0,0 +1,173 @@ +using NNlib, Test, Statistics, Random +using ChainRulesCore, ChainRulesTestUtils +using Base.Broadcast: broadcasted +import FiniteDifferences +import ForwardDiff +import Zygote +using Zygote: gradient +using StableRNGs +using Documenter 
+using Adapt +using KernelAbstractions +import ReverseDiff as RD # used in `pooling.jl` + +DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursive=true) + +# ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests +# ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests +# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests +ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests + +const rng = StableRNG(123) +include("test_utils.jl") + +macro conditional_testset(name, skip_tests, expr) + esc(quote + @testset $name begin + if $name ∉ $skip_tests + $expr + else + @test_skip false + end + end + end) +end + +cpu(x) = adapt(CPU(), x) + +include("gather.jl") +include("scatter.jl") +include("upsample.jl") + +function nnlib_testsuite(Backend; skip_tests = Set{String}()) + @conditional_testset "Upsample" skip_tests begin + upsample_testsuite(Backend) + end + @conditional_testset "Gather" skip_tests begin + gather_testsuite(Backend) + end + @conditional_testset "Scatter" skip_tests begin + scatter_testsuite(Backend) + end +end + +@testset verbose=true "NNlib.jl" begin + + if get(ENV, "NNLIB_TEST_CPU", "true") == "true" + @testset "CPU" begin + @testset "Doctests" begin + doctest(NNlib, manual=false) + end + + nnlib_testsuite(CPU) + + @testset "Activation Functions" begin + include("activations.jl") + include("bias_act.jl") + end + + @testset "Attention" begin + include("attention.jl") + end + + @testset "Batched Multiplication" begin + include("batchedmul.jl") + end + + @testset "Convolution" begin + include("conv.jl") + include("conv_bias_act.jl") + end + + @testset "CTC Loss" begin + include("ctc.jl") + end + + @testset "Dropout" begin + include("dropout.jl") + end + + @testset "Fold/Unfold" begin + include("fold.jl") + end + + @testset "Inference" begin + include("inference.jl") + end + + @testset "Pooling" begin + include("pooling.jl") + end + + @testset "Padding" begin + include("padding.jl") + end + + @testset "Softmax" begin + include("softmax.jl") + end + + @testset "Utilities" begin + include("utils.jl") + end + + @testset "Grid Sampling" begin + include("sampling.jl") + end + + @testset "Functions" begin + include("functions.jl") + end + end + else + @info "Skipping CPU tests, set NNLIB_TEST_CPU=true to run them." + end + + if get(ENV, "NNLIB_TEST_CUDA", "false") == "true" + using CUDA + if CUDA.functional() + @testset "CUDA" begin + nnlib_testsuite(CUDABackend; skip_tests=Set(("Scatter", "Gather"))) + + include("ext_cuda/runtests.jl") + end + else + @info "Insufficient version or CUDA not found; Skipping CUDA tests" + end + else + @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them" + end + + if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true" + using AMDGPU + AMDGPU.versioninfo() + if AMDGPU.functional() && AMDGPU.functional(:MIOpen) + @show AMDGPU.MIOpen.version() + @testset "AMDGPU" begin + nnlib_testsuite(ROCBackend) + AMDGPU.synchronize(; blocking=false) + + include("ext_amdgpu/runtests.jl") + AMDGPU.synchronize(; blocking=false) + end + else + @info "AMDGPU.jl package is not functional. Skipping AMDGPU tests." + end + else + @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them." + end + + if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" + @testset "CPU" begin + # Don't import LoopVectorization here! 
+ # Because the LV-impls are simply tested against NNlib's standard CPU impls, + # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! + @testset "LoopVectorization" begin + include("ext_loopvectorization/runtests.jl") + end + end + else + @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." + end + +end From 94f79642a2271ecb6c83a544e13a7aeb9245468c Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 23:26:30 +0200 Subject: [PATCH 08/37] Update runtests.jl --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 2a44d8d3e..5266b5f44 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,7 +16,7 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests # ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests -# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests +ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests const rng = StableRNG(123) include("test_utils.jl") From 6cc2e7516a5ea2b1046ba9b3a9f9bd4de8e6306c Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:22:04 +0200 Subject: [PATCH 09/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 39 ++++++++++++++------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 05157a8bc..9c4d82b75 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -1,34 +1,32 @@ -using NNlib, Test +using NNlib, Test, BenchmarkTools -function compute_conv_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}) where {T<:Real} +function compute_conv_outputs(settings::Vector{<:NNlib.ConvDims}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}, conv_output_grads::Vector{Array{T,4}}) where {T<:Real} conv_outs = Vector{Array{T, 4}}(undef, length(settings)) conv_grads = Vector{Array{T, 4}}(undef, length(settings)) for (i, setting) in enumerate(settings) - if setting.groups > 1 + if setting.groupcount > 1 weight = weight_grouped else weight = weight_ungrouped end - cdims = NNlib.DenseConvDims(size(input), size(weight), stride = setting.stride, padding = setting.padding, dilation = setting.dilation, groups = setting.groups) + out = @btime NNlib.conv($input, $weight, $setting) + output_gradient = conv_output_grads[i] - out = NNlib.conv(input, weight, cdims) - output_gradient = ones(T, size(out)) - - conv_grads[i] = NNlib.∇conv_data(output_gradient, weight, cdims) + conv_grads[i] = @btime NNlib.∇conv_data($output_gradient, $weight, $setting) conv_outs[i] = out end return conv_outs, conv_grads end -function compute_pool_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}) where {T<:Real} +function compute_pool_outputs(settings::Vector{<:NNlib.PoolDims}, input::Array{T,4}) where {T<:Real} pool_outs = Vector{Array{T, 4}}(undef, length(settings)) for (i, setting) in enumerate(settings) pdims = NNlib.PoolDims(size(input), setting.kernel_size, stride = setting.stride, padding = setting.padding, dilation = setting.dilation) - pool_outs[i] = NNlib.meanpool(input, 
pdims) + pool_outs[i] = @btime NNlib.meanpool($input, $pdims) end return pool_outs @@ -37,29 +35,34 @@ end @testset "Convolution & Pooling" begin dtype = Float32 - input = rand(dtype, 224, 224, 3, 64) # for conv & pool + batch_size = 64 + input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv conv_settings_list = [ - (; stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' - (; stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=1), # test 'second specialized case' - (; stride=(2, 1), padding=(2, 0), dilation=(2, 0), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'second specialized case' + NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' ] + conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, batch_size) for setting in conv_settings_list] + pool_settings_list = [ - (; kernel_size=(5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' - (; kernel_size=(5, 4), stride=(2, 1), padding=(2, 0), dilation=(2, 1)), # test 'general case' + NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' + NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] # compute outputs before loading LoopVectorization - conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + println("without LoopVectorization") + conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) using LoopVectorization # now load the NNlibLoopVectorizationExt - conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + println("with LoopVectorization") + conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) # validate conv From c5c79ee71f6c7c9b58c3a339c219557ce3ee3c90 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:28:37 +0200 Subject: [PATCH 10/37] Add files via upload --- Project.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 49ef5d07f..e1794c43f 100644 --- a/Project.toml +++ b/Project.toml @@ -41,6 +41,7 @@ julia = "1.9" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" @@ -56,6 +57,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["AMDGPU", "CUDA", "ChainRulesTestUtils", "Documenter", +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "Documenter", "FiniteDifferences", 
"ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From 132e35cc5608f7de900ba42a2112bdfe74a3fe44 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:59:01 +0200 Subject: [PATCH 11/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 9c4d82b75..f94b22acd 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -34,7 +34,7 @@ end @testset "Convolution & Pooling" begin - dtype = Float32 + dtype = Float64 # Float32 batch_size = 64 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv @@ -67,8 +67,22 @@ end # validate conv @test all(isapprox.(conv_outs_std, conv_outs_lv)) - @test all(isapprox.(conv_grads_std, conv_grads_lv)) + # @test all(isapprox.(conv_grads_std, conv_grads_lv)) # seems to be wrong on some CI devices, reason unknown # validate pool @test all(isapprox.(pool_outs_std, pool_outs_lv)) + @info isapprox(conv_grads_std[1], conv_grads_lv[1]) + @info isapprox(conv_grads_std[2], conv_grads_lv[2]) + @info isapprox(conv_grads_std[3], conv_grads_lv[3]) + + @testset "Conv impl 1" begin + @test isapprox(conv_grads_std[1], conv_grads_lv[1]) + end + @testset "Conv impl 2" begin + @test isapprox(conv_grads_std[2], conv_grads_lv[2]) + end + @testset "Conv impl 3" begin + @test isapprox(conv_grads_std[3], conv_grads_lv[3]) + end + end \ No newline at end of file From aa019e9c4d9dfce95d62a68748096a0e54e02d74 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 20:26:23 +0200 Subject: [PATCH 12/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index f94b22acd..12bb84204 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -72,8 +72,13 @@ end @test all(isapprox.(pool_outs_std, pool_outs_lv)) @info isapprox(conv_grads_std[1], conv_grads_lv[1]) + println(sum(conv_grads_std[1])); println(sum(conv_grads_lv[1])) + @info isapprox(conv_grads_std[2], conv_grads_lv[2]) + println(sum(conv_grads_std[2])); println(sum(conv_grads_lv[2])) + @info isapprox(conv_grads_std[3], conv_grads_lv[3]) + println(sum(conv_grads_std[3])); println(sum(conv_grads_lv[3])) @testset "Conv impl 1" begin @test isapprox(conv_grads_std[1], conv_grads_lv[1]) From 52e2a78a9a443c7aaebfc4e771baf730a95c5367 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 21:07:09 +0200 Subject: [PATCH 13/37] Add files via upload --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index e1794c43f..68ea1f946 100644 --- a/Project.toml +++ b/Project.toml @@ -37,7 +37,7 @@ GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" julia = "1.9" -# LoopVectorization = "=0.12.146" +LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" From d63f8a53f82d5ba10ac3450feadb38d37121de6e Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 20:05:25 +0200 Subject: [PATCH 14/37] Add files via upload 
--- test/ext_loopvectorization/minimal_test.jl | 81 ++++++++++++++++++++++ test/ext_loopvectorization/runtests.jl | 8 ++- 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 test/ext_loopvectorization/minimal_test.jl diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl new file mode 100644 index 000000000..a40ff985c --- /dev/null +++ b/test/ext_loopvectorization/minimal_test.jl @@ -0,0 +1,81 @@ +using NNlib, LoopVectorization + +function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + input_width, input_height, in_channels, current_batch_size = size(input_gradient) + + if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1) + throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1")) + end + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + return input_gradient +end + +function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + input_width, input_height, in_channels, current_batch_size = size(input_gradient) + + if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1) + throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1")) + end + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + for index_batch in 1:current_batch_size # NO @threads here + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # NO @turbo here! 
+ for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + return input_gradient +end + +dtype = Float32 # Float64 +batch_size = 5 +input = rand(dtype, 50, 50, 3, batch_size) +weight = rand(dtype, 5, 5, 3, 9) +cdims = NNlib.DenseConvDims(size(input), size(weight), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1) +output_gradient = rand(dtype, NNlib.output_size(cdims)..., 9, batch_size) + +input_gradient_noavx = zeros(dtype, size(input)...) +input_gradient_noavx = ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims) +input_gradient_noavx = @time ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims) + +input_gradient_avx = zeros(dtype, size(input)...) +input_gradient_avx = ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims) +input_gradient_avx = @time ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims) + +@info isapprox(input_gradient_noavx, input_gradient_avx) +@testset "conv bwd minimal" begin + @test isapprox(input_gradient_noavx, input_gradient_avx) +end +@show sum(input_gradient_noavx) +@show sum(input_gradient_avx) \ No newline at end of file diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 12bb84204..c715bb2f0 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -35,7 +35,7 @@ end @testset "Convolution & Pooling" begin dtype = Float64 # Float32 - batch_size = 64 + batch_size = 32 # 64 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv @@ -55,12 +55,15 @@ end # compute outputs before loading LoopVectorization + #= println("without LoopVectorization") conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) + =# using LoopVectorization # now load the NNlibLoopVectorizationExt + #= println("with LoopVectorization") conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) @@ -89,5 +92,8 @@ end @testset "Conv impl 3" begin @test isapprox(conv_grads_std[3], conv_grads_lv[3]) end + =# + + include("minimal_test.jl") end \ No newline at end of file From ae86d13c6523282398358cb484ccfabcd721152d Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 20:49:17 +0200 Subject: [PATCH 15/37] Add files via upload --- test/ext_loopvectorization/minimal_test.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index a40ff985c..e3750b862 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -18,7 +18,10 @@ function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T, weight = reverse(weight, dims=(1, 2)) end - Threads.@threads for index_batch in 1:current_batch_size + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + 
input_gradient .= zero(T) + + for index_batch in 1:current_batch_size # Threads.@threads @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] @@ -47,6 +50,9 @@ function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{ weight = reverse(weight, dims=(1, 2)) end + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + for index_batch in 1:current_batch_size # NO @threads here for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # NO @turbo here! for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width From 776835d6e64a091bf6b06b0170912d4992096d21 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 20:59:19 +0200 Subject: [PATCH 16/37] Add files via upload --- test/runtests.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5266b5f44..c78c74bcd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -158,13 +158,11 @@ end end if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" - @testset "CPU" begin + @testset "LoopVectorization" begin # Don't import LoopVectorization here! # Because the LV-impls are simply tested against NNlib's standard CPU impls, # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! - @testset "LoopVectorization" begin - include("ext_loopvectorization/runtests.jl") - end + include("ext_loopvectorization/runtests.jl") end else @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." 
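Two illustrative aids for the gradient-mismatch hunt above; both are sketches, not part of the patches. First, `@turbo` reassociates the floating-point accumulation, so even a correct vectorized kernel need not match the scalar loop bit for bit, which is why the comparisons rely on `isapprox` rather than `==` (this only accounts for roundoff-sized deviations, not the larger CI differences being chased here):

xs = Float32[1f0, 1f-8, -1f0, 1f-8]
s_ltr = foldl(+, xs)                        # left-to-right order: the first 1f-8 is absorbed by 1f0, result 1f-8
s_re  = (xs[1] + xs[3]) + (xs[2] + xs[4])   # a reassociated order: result 2f-8
s_ltr == s_re                               # false
isapprox(s_ltr, s_re; atol=1f-7)            # true

Second, the `sum(...)` printouts hide where two arrays actually disagree; a small helper that reports the largest elementwise deviation and its position makes it easier to tell reordering noise from an indexing bug (the helper name and usage are illustrative only):

function report_mismatch(a::AbstractArray, b::AbstractArray)
    d = abs.(a .- b)
    m, idx = findmax(d)                     # largest absolute difference and its CartesianIndex
    println("max abs diff = ", m, " at ", Tuple(idx), ", isapprox: ", isapprox(a, b))
    return m
end
# e.g. report_mismatch(conv_grads_std[1], conv_grads_lv[1])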
From 13205da155cceb8f5f708041897dbf3543f57ce8 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:11:15 +0200 Subject: [PATCH 17/37] Add files via upload --- Project.toml | 5 +++-- test/ext_loopvectorization/minimal_test.jl | 4 +++- test/ext_loopvectorization/runtests.jl | 10 ++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Project.toml b/Project.toml index 68ea1f946..29e948b6f 100644 --- a/Project.toml +++ b/Project.toml @@ -37,13 +37,14 @@ GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" julia = "1.9" -LoopVectorization = "=0.12.146" +# LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" +CpuId = "adafc99b-e345-5852-983c-f28acb93d879" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" @@ -57,6 +58,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "Documenter", +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index e3750b862..c081a8fb1 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -1,4 +1,4 @@ -using NNlib, LoopVectorization +using NNlib, LoopVectorization, CpuId function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} @@ -64,6 +64,8 @@ function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{ return input_gradient end +println(cpuinfo()) + dtype = Float32 # Float64 batch_size = 5 input = rand(dtype, 50, 50, 3, batch_size) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index c715bb2f0..932a04a57 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -34,8 +34,8 @@ end @testset "Convolution & Pooling" begin - dtype = Float64 # Float32 - batch_size = 32 # 64 + dtype = Float32 # Float64 + batch_size = 64 # 32 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv @@ -53,17 +53,15 @@ end NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] - # compute outputs before loading LoopVectorization - #= + # compute outputs before loading LoopVectorization + println("without LoopVectorization") conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) - =# using LoopVectorization # now load the NNlibLoopVectorizationExt - #= println("with LoopVectorization") conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) From 5850341872f5dc2670b5648bf2a4556f55ddb87b Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:24:09 +0200 Subject: [PATCH 18/37] Add files via upload --- test/ext_loopvectorization/minimal_test.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index c081a8fb1..b401efe93 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -66,7 +66,7 @@ end println(cpuinfo()) -dtype = Float32 # Float64 +dtype = Float64 # Float32 batch_size = 5 input = rand(dtype, 50, 50, 3, batch_size) weight = rand(dtype, 5, 5, 3, 9) From 00b28f2ca570553148dd846cb6b5bf4f69545243 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:34:02 +0200 Subject: [PATCH 19/37] Add files via upload --- test/ext_loopvectorization/minimal_test.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index b401efe93..c081a8fb1 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -66,7 +66,7 @@ end println(cpuinfo()) -dtype = Float64 # Float32 +dtype = Float32 # Float64 batch_size = 5 input = rand(dtype, 50, 50, 3, batch_size) weight = rand(dtype, 5, 5, 3, 9) From af04cc66bd91a5b71418b9f7afacf6a53128459a Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:34:36 +0200 Subject: [PATCH 20/37] Delete runtests.jl --- runtests.jl | 173 ---------------------------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 runtests.jl diff --git a/runtests.jl b/runtests.jl deleted file mode 100644 index 5266b5f44..000000000 --- a/runtests.jl +++ /dev/null @@ -1,173 +0,0 @@ -using NNlib, 
Test, Statistics, Random -using ChainRulesCore, ChainRulesTestUtils -using Base.Broadcast: broadcasted -import FiniteDifferences -import ForwardDiff -import Zygote -using Zygote: gradient -using StableRNGs -using Documenter -using Adapt -using KernelAbstractions -import ReverseDiff as RD # used in `pooling.jl` - -DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursive=true) - -# ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests -# ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests -# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests -ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests - -const rng = StableRNG(123) -include("test_utils.jl") - -macro conditional_testset(name, skip_tests, expr) - esc(quote - @testset $name begin - if $name ∉ $skip_tests - $expr - else - @test_skip false - end - end - end) -end - -cpu(x) = adapt(CPU(), x) - -include("gather.jl") -include("scatter.jl") -include("upsample.jl") - -function nnlib_testsuite(Backend; skip_tests = Set{String}()) - @conditional_testset "Upsample" skip_tests begin - upsample_testsuite(Backend) - end - @conditional_testset "Gather" skip_tests begin - gather_testsuite(Backend) - end - @conditional_testset "Scatter" skip_tests begin - scatter_testsuite(Backend) - end -end - -@testset verbose=true "NNlib.jl" begin - - if get(ENV, "NNLIB_TEST_CPU", "true") == "true" - @testset "CPU" begin - @testset "Doctests" begin - doctest(NNlib, manual=false) - end - - nnlib_testsuite(CPU) - - @testset "Activation Functions" begin - include("activations.jl") - include("bias_act.jl") - end - - @testset "Attention" begin - include("attention.jl") - end - - @testset "Batched Multiplication" begin - include("batchedmul.jl") - end - - @testset "Convolution" begin - include("conv.jl") - include("conv_bias_act.jl") - end - - @testset "CTC Loss" begin - include("ctc.jl") - end - - @testset "Dropout" begin - include("dropout.jl") - end - - @testset "Fold/Unfold" begin - include("fold.jl") - end - - @testset "Inference" begin - include("inference.jl") - end - - @testset "Pooling" begin - include("pooling.jl") - end - - @testset "Padding" begin - include("padding.jl") - end - - @testset "Softmax" begin - include("softmax.jl") - end - - @testset "Utilities" begin - include("utils.jl") - end - - @testset "Grid Sampling" begin - include("sampling.jl") - end - - @testset "Functions" begin - include("functions.jl") - end - end - else - @info "Skipping CPU tests, set NNLIB_TEST_CPU=true to run them." - end - - if get(ENV, "NNLIB_TEST_CUDA", "false") == "true" - using CUDA - if CUDA.functional() - @testset "CUDA" begin - nnlib_testsuite(CUDABackend; skip_tests=Set(("Scatter", "Gather"))) - - include("ext_cuda/runtests.jl") - end - else - @info "Insufficient version or CUDA not found; Skipping CUDA tests" - end - else - @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them" - end - - if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true" - using AMDGPU - AMDGPU.versioninfo() - if AMDGPU.functional() && AMDGPU.functional(:MIOpen) - @show AMDGPU.MIOpen.version() - @testset "AMDGPU" begin - nnlib_testsuite(ROCBackend) - AMDGPU.synchronize(; blocking=false) - - include("ext_amdgpu/runtests.jl") - AMDGPU.synchronize(; blocking=false) - end - else - @info "AMDGPU.jl package is not functional. Skipping AMDGPU tests." - end - else - @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them." 
- end - - if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" - @testset "CPU" begin - # Don't import LoopVectorization here! - # Because the LV-impls are simply tested against NNlib's standard CPU impls, - # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! - @testset "LoopVectorization" begin - include("ext_loopvectorization/runtests.jl") - end - end - else - @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." - end - -end From 990a34c189d16bed8a6d0c52e5c6b2fc365d6f8e Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:37:05 +0200 Subject: [PATCH 21/37] Delete Project.toml --- Project.toml | 63 ---------------------------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 Project.toml diff --git a/Project.toml b/Project.toml deleted file mode 100644 index 29e948b6f..000000000 --- a/Project.toml +++ /dev/null @@ -1,63 +0,0 @@ -name = "NNlib" -uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.9.6" - -[deps] -Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" -ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Requires = "ae029012-a4dd-5104-9daa-d747884805df" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[weakdeps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" - -[extensions] -NNlibAMDGPUExt = "AMDGPU" -NNlibCUDAExt = "CUDA" -NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] -NNlibLoopVectorizationExt = "LoopVectorization" - -[compat] -AMDGPU = "0.5, 0.6" -Adapt = "3.2" -Atomix = "0.1" -ChainRulesCore = "1.13" -CUDA = "4, 5" -cuDNN = "1" -GPUArraysCore = "0.1" -KernelAbstractions = "0.9.2" -Requires = "1.0" -julia = "1.9" -# LoopVectorization = "=0.12.146" - -[extras] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" -CpuId = "adafc99b-e345-5852-983c-f28acb93d879" -Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" -ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" -ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" -StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" -Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" - -[targets] -test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", - "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", - "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From db0ad6629e02828b618f5a00a378387d489a335f Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:39:46 +0200 Subject: [PATCH 22/37] Add files via upload --- 
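For readers following the Project.toml churn in these patches: with the [weakdeps] and [extensions] entries the re-added file below declares, no further opt-in is needed, since Julia >= 1.9 loads the extension automatically once both packages are present in the same session. A minimal sketch of what that looks like from the user side:

using NNlib               # LoopVectorization methods are not active yet
using LoopVectorization   # NNlibLoopVectorizationExt now loads and overrides NNlib.conv!, NNlib.∇conv_data! and NNlib.meanpool!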
Project.toml | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 Project.toml diff --git a/Project.toml b/Project.toml new file mode 100644 index 000000000..29e948b6f --- /dev/null +++ b/Project.toml @@ -0,0 +1,63 @@ +name = "NNlib" +uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +version = "0.9.6" + +[deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" + +[extensions] +NNlibAMDGPUExt = "AMDGPU" +NNlibCUDAExt = "CUDA" +NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] +NNlibLoopVectorizationExt = "LoopVectorization" + +[compat] +AMDGPU = "0.5, 0.6" +Adapt = "3.2" +Atomix = "0.1" +ChainRulesCore = "1.13" +CUDA = "4, 5" +cuDNN = "1" +GPUArraysCore = "0.1" +KernelAbstractions = "0.9.2" +Requires = "1.0" +julia = "1.9" +# LoopVectorization = "=0.12.146" + +[extras] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" +CpuId = "adafc99b-e345-5852-983c-f28acb93d879" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" + +[targets] +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", + "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", + "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From a4e18e6b8311afc88c9587b55954884d8cef8a42 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Fri, 29 Sep 2023 14:23:05 +0200 Subject: [PATCH 23/37] Add files via upload --- Project.toml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 29e948b6f..d267c698f 100644 --- a/Project.toml +++ b/Project.toml @@ -16,26 +16,29 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" [extensions] NNlibAMDGPUExt = "AMDGPU" -NNlibCUDAExt = "CUDA" NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] +NNlibCUDAExt = "CUDA" +NNlibEnzymeCoreExt = 
"EnzymeCore" NNlibLoopVectorizationExt = "LoopVectorization" [compat] AMDGPU = "0.5, 0.6" Adapt = "3.2" Atomix = "0.1" -ChainRulesCore = "1.13" CUDA = "4, 5" -cuDNN = "1" +ChainRulesCore = "1.13" +EnzymeCore = "0.5, 0.6" GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" +cuDNN = "1" julia = "1.9" # LoopVectorization = "=0.12.146" @@ -46,6 +49,9 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" CpuId = "adafc99b-e345-5852-983c-f28acb93d879" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +EnzymeTestUtils = "12d8515a-0907-448a-8884-5fe00fdf1c5a" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -58,6 +64,5 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", - "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", - "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN", + "Enzyme", "EnzymeCore", "EnzymeTestUtils"] From f58437714575bcd710ec3dc78ba0acd5fc3f03ef Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:10:40 +0200 Subject: [PATCH 24/37] Add files via upload --- .../NNlibLoopVectorizationExt.jl | 5 ++ ext/NNlibLoopVectorizationExt/conv.jl | 76 +++++++++++++++++-- ext/NNlibLoopVectorizationExt/pooling.jl | 2 +- 3 files changed, 76 insertions(+), 7 deletions(-) diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl index d2053161c..3be6c28a2 100644 --- a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl +++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl @@ -3,6 +3,11 @@ module NNlibLoopVectorizationExt using NNlib using LoopVectorization using Random, Statistics +using OffsetArrays, Static + +# Bevor die Methoden überladen werden, sollte ein Selfcheck durchgeführt werden, ob die Ergebnisse mit NNlib übereinstimmen. +# Wenn nicht, sollte eine Warnung angezeigt werden und die wenn irgendwie möglich, nur die funktionierenden Methoden verwendet werden. +# Z.b. in dem bei falschem Ergebniss, die im2col Methode als Backend in der überladenen Methode aufgerufen wird. include("conv.jl") include("pooling.jl") diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index 3aafe9246..dd59ecb0c 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -6,8 +6,11 @@ Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the original implementation in some situations, it is left out for the moment. 
-Implementation copied from here (Jonas Steinebach, MIT): +Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license): https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl + +Implementation for backward pass mostly copied from here (Chris Elrod, MIT license): +https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl =# function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} @@ -123,21 +126,44 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ x_dilation, y_dilation = cdims.dilation out_channels_per_group = out_channels ÷ groups + @inline static_size(x::AbstractArray{T, N}) where {T, N} = static.(size(x)) + # actual computation (using @tturbo instead of Threads.@threads + @turbo may end up in wrong results) if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance # println("backward: very specialized case for maximum performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + output_gradient = OffsetArray(output_gradient, OffsetArrays.Origin(0, 0, 0, 0)) + input_gradient_padded = OffsetArray(input_gradient_padded, OffsetArrays.Origin(0, 0, 0, 0)) + weight = OffsetArray(weight, OffsetArrays.Origin(0, 0, 0, 0)) + + input_width, input_height, in_channels, batch_size = static_size(input_gradient_padded) + weight_width, weight_height, in_channels_weight, out_channels = static_size(weight) + + y_upper_bound = static(output_height) # input_width - weight_width + static(1) + x_upper_bound = static(output_width) # input_height - weight_height + static(1) + + @tturbo for index_batch in 0:batch_size-1 + for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 # @tturbo unroll = (2, 1) + + value = zero(T) + for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 + ib0 = (x_in - x_w >= 0) & (x_in - x_w < x_upper_bound) + ib1 = (y_in - y_w >= 0) & (y_in - y_w < y_upper_bound) + output_gradient_value = (ib0 & ib1) ? output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] : zero(T) + value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value + # value += (ib0 & ib1) ? 
output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) end + input_gradient_padded[x_in, y_in, in_channel, index_batch] = value + end end - elseif groups == 1 # second specialized case for better performance + input_gradient_padded = input_gradient_padded.parent + + elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance # println("backward: second specialized case for better performance") + #= Threads.@threads for index_batch in 1:current_batch_size @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width m = y_out + (y_stride - 1) * (y_out - 1) @@ -149,6 +175,44 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ end end end + =# + + y_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_height) + x_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_width) + x_out_indices .= -1 + y_out_indices .= -1 + + @turbo for y_out in 1:output_height, x_out in 1:output_width # + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + y_out_indices[x_w, y_w, y_in] = y_out + x_out_indices[x_w, y_w, x_in] = x_out + end + end + + @tturbo for index_batch in 1:current_batch_size + for x_in in 1:input_width, y_in in 1:input_height, in_channel in 1:in_channels # @tturbo unroll = (2, 1) + + value = zero(T) + for x_w in 1:weight_width, y_w in 1:weight_height, out_channel in 1:out_channels + + x_out = x_out_indices[x_w, y_w, x_in] + y_out = y_out_indices[x_w, y_w, y_in] + + ib0 = x_out > -1 # != + ib1 = y_out > -1 # != + + output_gradient_value = (ib0 & ib1) ? output_gradient[x_out, y_out, out_channel, index_batch] : zero(T) + # output_gradient_value = T(2.0) # output_gradient[x_out, y_out, out_channel, index_batch] + value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value + end + input_gradient[x_in, y_in, in_channel, index_batch] = value + + end + end else # general case for any convolution # println("backward: general case for any convolution") diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl index dc948335b..420bb7413 100644 --- a/ext/NNlibLoopVectorizationExt/pooling.jl +++ b/ext/NNlibLoopVectorizationExt/pooling.jl @@ -5,7 +5,7 @@ The speed up is usually lower compared to conv but can be approximately up to 2x Since the current state of LoopVectorization ∇meanpool! isn't really faster than the original implementation in some situations, it is left out for the moment. 
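A plain scalar reference for the masked-gather backward pass added to conv.jl above, useful for checking the index algebra by eye. It is an illustrative sketch only: stride = dilation = 1, no padding, kernel already flipped, and ordinary 1-based indexing instead of the origin-0 OffsetArrays of the patch. (For the strided cases, note that `y_out + (y_stride - 1) * (y_out - 1)` is simply `(y_out - 1) * y_stride + 1`, the first input row an output row touches.)

function ∇conv_data_ref!(dx::Array{T,4}, dy::Array{T,4}, w::Array{T,4}) where {T<:Real}
    W, H, C, B = size(dx)               # input gradient: width, height, channels, batch
    KW, KH, _, O = size(w)              # kernel extents and output channels
    OW, OH = W - KW + 1, H - KH + 1     # output extents for stride 1, no padding
    for b in 1:B, c in 1:C, yi in 1:H, xi in 1:W
        v = zero(T)
        for o in 1:O, kh in 1:KH, kw in 1:KW
            xo = xi - kw + 1            # the output column that read this input pixel with this kernel column
            yo = yi - kh + 1
            if 1 <= xo <= OW && 1 <= yo <= OH
                v += w[kw, kh, c, o] * dy[xo, yo, o, b]
            end
        end
        dx[xi, yi, c, b] = v            # gather formulation: each entry written once, no overlapping `+=`
    end
    return dx
end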
-Implementation inspired from here (Jonas Steinebach, MIT): +Implementation inspired from here (Jonas Steinebach, MIT license): https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl =# From 274db102b4a2e5643df96e1ed0415719b30e212a Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:10:53 +0200 Subject: [PATCH 25/37] Add files via upload --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d267c698f..fe9314f97 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "NNlib" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.9.6" +version = "0.9.7" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -9,9 +9,11 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Requires = "ae029012-a4dd-5104-9daa-d747884805df" +Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] From 6c33d5cb94505acde722b64d2a49adb3571ad666 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:11:32 +0200 Subject: [PATCH 26/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 932a04a57..f9b5bf867 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -35,7 +35,7 @@ end @testset "Convolution & Pooling" begin dtype = Float32 # Float64 - batch_size = 64 # 32 + batch_size = 64 # 1 # 64 # 32 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv @@ -53,7 +53,6 @@ end NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] - #= # compute outputs before loading LoopVectorization println("without LoopVectorization") @@ -90,8 +89,5 @@ end @testset "Conv impl 3" begin @test isapprox(conv_grads_std[3], conv_grads_lv[3]) end - =# - - include("minimal_test.jl") end \ No newline at end of file From 7affd464c174413d8d24ff05f5ef7b30041a0862 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:42:48 +0200 Subject: [PATCH 27/37] Add files via upload --- ext/NNlibLoopVectorizationExt/conv.jl | 455 ++++++++++++++++++-------- 1 file changed, 327 insertions(+), 128 deletions(-) diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index dd59ecb0c..e0876db0f 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -1,18 +1,273 @@ -#= -Accelerated convolution for 2d-images using the power of LoopVectorization. -The acceleration is usually greatest when the inputs have a large spatial size and few channels. -Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. 
+# # pad naivly, static iters +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + # padding is done naively at the moment + if cdims.padding != (0, 0, 0, 0) + input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2)) + end + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batch_size = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("forward: very specialized case for maximum performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance + # println("forward: second specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + static((y_stride - 1)) * (y_out - 1) + n = x_out + static((x_stride - 1)) * (x_out - 1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) + # x_in = n + (x_w - 1) + value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 # third specialized case for better performance + # println("forward: third specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + static((y_stride - 1)) * (y_out - 1) + n = x_out + static((x_stride - 1)) * (x_out - 1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else # 
general case for any convolution + # println("forward: general case for any convolution") + + @tturbo for index_batch in 1:batch_size + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width + m = y_out + static((y_stride - 1)) * (y_out - 1) + n = x_out + static((x_stride - 1)) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + + return output +end + +#= # pad bounds check +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batch_size = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + out_channels_per_group = out_channels ÷ groups + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + cdims_3d = DenseConvDims((input_width, input_height, 1, in_channels, batch_size), (weight_width, weight_height, 1, in_channels_weight, out_channels), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(cdims_3d) + + # Start with the central region + w_region, h_region, _ = central_region + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("forward: very specialized case for maximum performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + value += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance + # 
println("forward: second specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) + # x_in = n + (x_w - 1) + value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 # third specialized case for better performance + # println("forward: third specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else # general case for any convolution + # println("forward: general case for any convolution") + + @tturbo for index_batch in 1:batch_size + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + + # @show w_region # 1:0 warning: when padding is unequal and one of x_pad1 or y_pad1 is 0, emty collections are possible + # @show h_region # 1:0 if isempty(1:0) -> true + # println() + + if cdims.padding != (0, 0, 0, 0) + # Next, the padded regions + for (w_region, h_region, d_region) in padded_regions # @inbounds + for z_out in d_region # for skipping the d_regions + + if cdims.groupcount == 1 + + @tturbo for index_batch in 1:batch_size # @turbo + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation -Since the current state of LoopVectorization ∇conv_filter! 
isn't really faster than the -original implementation in some situations, it is left out for the moment. + is_in_bound_x = (x_in >= 1) & (x_in <= input_width) + is_in_bound_y = (y_in >= 1) & (y_in <= input_height) -Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license): -https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl + input_value = (is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel, index_batch] : zero(T) + value += input_value * weight[x_w, y_w, in_channel, out_channel] -Implementation for backward pass mostly copied from here (Chris Elrod, MIT license): -https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl + # value += (ib0 & ib1) ? input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) + + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else + + @tturbo for index_batch in 1:batch_size # @turbo + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + + is_in_bound_x = (x_in >= 1) & (x_in <= input_width) + is_in_bound_y = (y_in >= 1) & (y_in <= input_height) + + input_value = (is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel_input, index_batch] : zero(T) + value += input_value * weight[x_w, y_w, in_channel_weight, out_channel] + + # value += (ib0 & ib1) ? input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] : zero(T) + + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + end + end + end + + return output +end =# +#= function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) @@ -92,66 +347,81 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, return output end +=# + +function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + ∇conv_data!( + NNlib.insert_singleton_spatial_dimension(input_gradient, 1), + NNlib.insert_singleton_spatial_dimension(output_gradient, 1), + NNlib.insert_singleton_spatial_dimension(weight, 1), + NNlib.insert_singleton_spatial_dimension(cdims, 1) + ) + + return input_gradient +end + +function ∇conv_filter_im2col_grouped!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims) where {T<:Real} + + ∇conv_filter!( + NNlib.insert_singleton_spatial_dimension(weight_gradient, 1), + NNlib.insert_singleton_spatial_dimension(input, 1), + NNlib.insert_singleton_spatial_dimension(output_gradient, 1), + NNlib.insert_singleton_spatial_dimension(cdims, 1) + ) + + return weight_gradient +end + +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} -function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - # check if input_gradient must be padded (padding is done naively at the moment) - if cdims.padding != (0, 0, 0, 0) - input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) - else - input_gradient_padded = input_gradient - end - # store the size of input after padding - input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding - - # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("backward: very specialized case for maximum performance") - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups + # storing all the necessary shapes + output_width, output_height, out_channels, batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded (padding is done naively at the moment) 
+ if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) + else + input_gradient_padded = input_gradient + end - @inline static_size(x::AbstractArray{T, N}) where {T, N} = static.(size(x)) + # store the size of input after padding + input_width, input_height, in_channels, batch_size = size(input_gradient_padded) # size after padding - # actual computation (using @tturbo instead of Threads.@threads + @turbo may end up in wrong results) - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("backward: very specialized case for maximum performance") + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end output_gradient = OffsetArray(output_gradient, OffsetArrays.Origin(0, 0, 0, 0)) input_gradient_padded = OffsetArray(input_gradient_padded, OffsetArrays.Origin(0, 0, 0, 0)) weight = OffsetArray(weight, OffsetArrays.Origin(0, 0, 0, 0)) - input_width, input_height, in_channels, batch_size = static_size(input_gradient_padded) - weight_width, weight_height, in_channels_weight, out_channels = static_size(weight) - - y_upper_bound = static(output_height) # input_width - weight_width + static(1) - x_upper_bound = static(output_width) # input_height - weight_height + static(1) + input_width, input_height, in_channels, batch_size = size(input_gradient_padded) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) @tturbo for index_batch in 0:batch_size-1 - for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 # @tturbo unroll = (2, 1) + for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 value = zero(T) - for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 - ib0 = (x_in - x_w >= 0) & (x_in - x_w < x_upper_bound) - ib1 = (y_in - y_w >= 0) & (y_in - y_w < y_upper_bound) - output_gradient_value = (ib0 & ib1) ? output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] : zero(T) + for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) + + is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width) + is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height) + output_gradient_value = (is_in_bound_x & is_in_bound_y) ? output_gradient[x_in - x_w, y_in - y_w, out_channel, index_batch] : zero(T) value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value - # value += (ib0 & ib1) ? 
output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) + end input_gradient_padded[x_in, y_in, in_channel, index_batch] = value @@ -160,86 +430,15 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ input_gradient_padded = input_gradient_padded.parent - elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance - # println("backward: second specialized case for better performance") - - #= - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - =# - - y_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_height) - x_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_width) - x_out_indices .= -1 - y_out_indices .= -1 - - @turbo for y_out in 1:output_height, x_out in 1:output_width # - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - y_out_indices[x_w, y_w, y_in] = y_out - x_out_indices[x_w, y_w, x_in] = x_out - end - end - - @tturbo for index_batch in 1:current_batch_size - for x_in in 1:input_width, y_in in 1:input_height, in_channel in 1:in_channels # @tturbo unroll = (2, 1) - - value = zero(T) - for x_w in 1:weight_width, y_w in 1:weight_height, out_channel in 1:out_channels - - x_out = x_out_indices[x_w, y_w, x_in] - y_out = y_out_indices[x_w, y_w, y_in] - - ib0 = x_out > -1 # != - ib1 = y_out > -1 # != - - output_gradient_value = (ib0 & ib1) ? 
output_gradient[x_out, y_out, out_channel, index_batch] : zero(T) - # output_gradient_value = T(2.0) # output_gradient[x_out, y_out, out_channel, index_batch] - value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value - end - input_gradient[x_in, y_in, in_channel, index_batch] = value - - end + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] end else # general case for any convolution - # println("backward: general case for any convolution") - - Threads.@threads for index_batch in 1:current_batch_size - for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results - @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - - end - - # depad - if cdims.padding != (0, 0, 0, 0) - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] - end + input_gradient = ∇conv_data_im2col_grouped!(input_gradient, output_gradient, weight, cdims) + end return input_gradient end \ No newline at end of file From 3130f8a02ced2271a433e31728f8b11cfbd0e9b9 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:43:33 +0200 Subject: [PATCH 28/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 30 ++++++-------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index f9b5bf867..262ae42d0 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -42,15 +42,18 @@ end conv_settings_list = [ NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' - NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'second specialized case' - NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'second specialized case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'third specialized case' + ### NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 1), dilation=(2, 1), groups=3), # test 'general case' ] conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, batch_size) for setting in 
conv_settings_list] pool_settings_list = [ NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' - NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' + # NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' + NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 1), dilation=(2, 1)), # test 'general case' ] # compute outputs before loading LoopVectorization @@ -67,27 +70,8 @@ end # validate conv @test all(isapprox.(conv_outs_std, conv_outs_lv)) - # @test all(isapprox.(conv_grads_std, conv_grads_lv)) # seems to be wrong on some CI devices, reason unknown + @test all(isapprox.(conv_grads_std, conv_grads_lv)) # validate pool @test all(isapprox.(pool_outs_std, pool_outs_lv)) - @info isapprox(conv_grads_std[1], conv_grads_lv[1]) - println(sum(conv_grads_std[1])); println(sum(conv_grads_lv[1])) - - @info isapprox(conv_grads_std[2], conv_grads_lv[2]) - println(sum(conv_grads_std[2])); println(sum(conv_grads_lv[2])) - - @info isapprox(conv_grads_std[3], conv_grads_lv[3]) - println(sum(conv_grads_std[3])); println(sum(conv_grads_lv[3])) - - @testset "Conv impl 1" begin - @test isapprox(conv_grads_std[1], conv_grads_lv[1]) - end - @testset "Conv impl 2" begin - @test isapprox(conv_grads_std[2], conv_grads_lv[2]) - end - @testset "Conv impl 3" begin - @test isapprox(conv_grads_std[3], conv_grads_lv[3]) - end - end \ No newline at end of file From d87f909e93239f83a570a846a2e2ba4340216392 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 7 Oct 2023 23:08:58 +0200 Subject: [PATCH 29/37] Add files via upload --- ext/NNlibLoopVectorizationExt/conv.jl | 166 ++++++++++++++++++++++---- 1 file changed, 143 insertions(+), 23 deletions(-) diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index e0876db0f..a75026b82 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -1,6 +1,10 @@ -# # pad naivly, static iters +# pad naivly, static iters function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + # if cdims.stride != (1, 1) + # return conv_im2col_grouped!(output, input, weight, cdims) + # end + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) @@ -32,6 +36,7 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value @@ -43,13 +48,16 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + 
static((y_stride - 1)) * (y_out - 1) - n = x_out + static((x_stride - 1)) * (x_out - 1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) # x_in = n + (x_w - 1) - value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + # value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + value += input[n + x_w - 1, m + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value end @@ -60,8 +68,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + static((y_stride - 1)) * (y_out - 1) - n = x_out + static((x_stride - 1)) * (x_out - 1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) * y_dilation @@ -77,8 +87,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width - m = y_out + static((y_stride - 1)) * (y_out - 1) - n = x_out + static((x_stride - 1)) * (x_out - 1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) @@ -96,7 +108,8 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, return output end -#= # pad bounds check +#= +# pad bounds check function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) @@ -146,8 +159,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) @@ -163,8 +178,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) * y_dilation @@ -180,8 +197,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) @@ -209,10 +228,13 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size # @turbo for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width y_in = m + (y_w - 1) * y_dilation x_in = n + (x_w - 1) * x_dilation @@ -234,11 +256,14 @@ function 
NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size # @turbo for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) - for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width y_in = m + (y_w - 1) * y_dilation x_in = n + (x_w - 1) * x_dilation @@ -349,6 +374,18 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, end =# +function conv_im2col_grouped!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + conv!( + NNlib.insert_singleton_spatial_dimension(output, 1), + NNlib.insert_singleton_spatial_dimension(input, 1), + NNlib.insert_singleton_spatial_dimension(weight, 1), + NNlib.insert_singleton_spatial_dimension(cdims, 1) + ) + + return output +end + function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} ∇conv_data!( @@ -415,7 +452,8 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 value = zero(T) - for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) + # for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) # RICHTIG!! + for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width) is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height) @@ -441,4 +479,86 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ end return input_gradient -end \ No newline at end of file +end + +#= +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + # println("myconv data back called") + + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded + if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) + else + input_gradient_padded = input_gradient + end + # store the size of input after padding + input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding + + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + # actual computation + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("very specialized case for maximum performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + elseif groups == 1 # second specialized case for better performance + # println("second specialized case for better performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + else # general case for any convolution + # println("general case for any convolution") + Threads.@threads for index_batch in 1:current_batch_size + for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results + @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + input_gradient_padded[x_in, y_in, in_channel_input, 
index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + end + + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] + end + + return input_gradient +end +=# \ No newline at end of file From 82abca83149c8824ba4eca1400076c3074212aec Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 7 Oct 2023 23:09:26 +0200 Subject: [PATCH 30/37] Add files via upload --- bench_torch.py | 32 +++++++++++++++ benchmark.jl | 76 ++++++++++++++++++++++++++++++++++++ benchmark_result_julia.csv | 8 ++++ benchmark_result_pytorch.csv | 8 ++++ benchmark_torch.py | 64 ++++++++++++++++++++++++++++++ 5 files changed, 188 insertions(+) create mode 100644 bench_torch.py create mode 100644 benchmark.jl create mode 100644 benchmark_result_julia.csv create mode 100644 benchmark_result_pytorch.csv create mode 100644 benchmark_torch.py diff --git a/bench_torch.py b/bench_torch.py new file mode 100644 index 000000000..1a97dc673 --- /dev/null +++ b/bench_torch.py @@ -0,0 +1,32 @@ +import torch +import torchvision.models as models +from torch.profiler import profile, record_function, ProfilerActivity +import time + +model = models.efficientnet_v2_m() +model.eval() + +b_size = 1 +img = torch.rand(b_size, 3, 224, 224) + +with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof: + with record_function("model_inference"): + pred = model(img) + """ + with record_function("model_backward"): + loss = torch.sum(pred - 0.5) # dummy loss + loss.backward() + """ + +print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1)) +# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1)) + +start1 = time.perf_counter() +pred = model(img) +start2 = time.perf_counter() +loss = torch.sum(pred - 0.5) # dummy loss +loss.backward() +end = time.perf_counter() +print(f"Time used inference: {start2 - start1} seconds") +print(f"Time used backward: {end - start2} seconds") +print(f"Time used inference and backward: {end - start1} seconds") \ No newline at end of file diff --git a/benchmark.jl b/benchmark.jl new file mode 100644 index 000000000..83262f6e0 --- /dev/null +++ b/benchmark.jl @@ -0,0 +1,76 @@ +using NNlib, Flux, Metalhead +using BenchmarkTools +using DataFrames, CSV + +forward(model, input) = model(input) + +dummy_loss(output) = sum(output .- 1) + +function train_step(model, input) + ∇model, ∇input = gradient(model, input) do m, x + dummy_loss(m(x)) + end + return ∇model, ∇input +end + +function benchmark(models, dtype, batch_sizes, channels, spatial_size) + model_names = sort(collect(keys(models))) # make sure the models are always in the same order + forward_times = zeros(length(model_names), length(batch_sizes)) + train_step_times = zeros(length(model_names), length(batch_sizes)) + + for (i, model_name) in enumerate(model_names) + println("Benchmarking $model_name...") + for (j, batch_size) in enumerate(batch_sizes) + + input = rand(dtype, spatial_size..., channels, batch_size) + model = models[model_name] + + forward(model, input) # compilation + train_step(model, input) # compilation + + forward_times[i, j] = @belapsed forward($model, $input) # @elapsed + train_step_times[i, j] = @belapsed train_step($model, $input) # @elapsed + + end + end + + return forward_times, train_step_times +end + +# 
models which should be benchmarked +models = Dict( + "ResNet18" => ResNet(18), + "WideResNet50" => WideResNet(50), + "DenseNet121" => DenseNet(121), + "EfficientNet" => EfficientNet(:b0), + "EfficientNetv2" => EfficientNetv2(:small), + "MobileNetv3" => MobileNetv3(:small), + # "GoogLeNet" => GoogLeNet(), + "ConvNeXt" => ConvNeXt(:tiny), +) + +# the data type and batch sizes which should be benchmarked +dtype = Float32 +batch_sizes = (1, 32) +# size information (e.g. ImageNet-like images) +channels = 3 +spatial_size = (224, 224) # WH + +forward_times1, train_step_times1 = benchmark(models, dtype, batch_sizes, channels, spatial_size) +using LoopVectorization # load LoopVectorization here to load the lv-extension +forward_times2, train_step_times2 = benchmark(models, dtype, batch_sizes, channels, spatial_size) + +df = DataFrame() +df[!, "model_names"] = sort(collect(keys(models))) # make sure the models are always in the same order + +for (i, batch_size) in enumerate(batch_sizes) + df[!, "acceleration inference, batch_size: $batch_size"] = forward_times1[:, i] ./ forward_times2[:, i] + df[!, "acceleration train, batch_size: $batch_size"] = train_step_times1[:, i] ./ train_step_times2[:, i] + + df[!, "im2col, inference, batch_size: $batch_size"] = forward_times1[:, i] + df[!, "lv-ext, inference, batch_size: $batch_size"] = forward_times2[:, i] + df[!, "im2col, train, batch_size: $batch_size"] = train_step_times1[:, i] + df[!, "lv-ext, train, batch_size: $batch_size"] = train_step_times2[:, i] +end + +CSV.write("benchmark_result_julia.csv", df) \ No newline at end of file diff --git a/benchmark_result_julia.csv b/benchmark_result_julia.csv new file mode 100644 index 000000000..13c60f90d --- /dev/null +++ b/benchmark_result_julia.csv @@ -0,0 +1,8 @@ +model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" +ConvNeXt,2.696525688337994,0.8577982223471639,0.3655661,0.1355693,1.0179392,1.1866884,1.1012307960913856,0.9863925898269857,4.8062238,4.364411,14.39549,14.5940776 +DenseNet121,2.8334019239672825,1.5740008937784382,0.1762512,0.0622048,0.7033689,0.4468669,1.263441740346075,0.9985112159016455,2.8772053,2.2772758,13.7427543,13.7632448 +EfficientNet,2.931731906635844,1.3684833239947383,0.1895995,0.0646715,0.7397858,0.5405881,1.0664467750727167,1.1269442586755343,2.4186004,2.2679054,8.6440818,7.6703721 +EfficientNetv2,5.053202171229253,5.590555646664826,0.6446749,0.1275775,12.9539883,2.31712,1.4478919823022125,1.224915348029691,6.3059734,4.3552789,22.0001035,17.9605093 +MobileNetv3,6.416791193464834,1.0744301651818315,0.0515294,0.0080304,0.1731701,0.1611739,1.2257825196180756,1.0673281389045863,0.3963721,0.3233625,1.6299691,1.527149 +ResNet18,1.2245235336423206,1.1195501549857,0.0490358,0.0400448,0.1998762,0.1785326,1.0889384258792703,0.9128196230532792,0.8104152,0.744225,3.1845247,3.4886681 +WideResNet50,0.670115626920713,0.7518492599095661,0.1744445,0.26032,0.7198077,0.957383,0.9693667932236841,0.8498274823283908,4.0205619,4.1476167,15.3664104,18.0817998 diff --git a/benchmark_result_pytorch.csv b/benchmark_result_pytorch.csv new file mode 100644 index 000000000..5f929352b --- /dev/null +++ b/benchmark_result_pytorch.csv @@ 
-0,0 +1,8 @@ +,model_names,"inference, batch_size: 1","train, batch_size: 1","inference, batch_size: 32","train, batch_size: 32" +0,ConvNeXt,0.0805797,0.2446732,1.598144,4.723875 +1,DenseNet121,0.1164808,0.2522871,1.5435163,4.5821567 +2,EfficientNet,0.0598909,0.1230739,0.9899088,2.6415434 +3,EfficientNetv2,0.13141,0.2856547,1.8736856,5.1560116 +4,MobileNetv3,0.0220434,0.0392312,0.2386686,0.5895018 +5,ResNet18,0.0289765,0.0681748,0.4151833,1.2125939 +6,WideResNet50,0.1647507,0.4650428,2.3093054,6.9086385 diff --git a/benchmark_torch.py b/benchmark_torch.py new file mode 100644 index 000000000..55705ad8b --- /dev/null +++ b/benchmark_torch.py @@ -0,0 +1,64 @@ +import torch +import torchvision.models as visionmodels +import time +import pandas as pd + +def dummy_loss(output): + return torch.sum(output - 1) + +def train_step(model, input_to_model): + output = model(input_to_model) + loss = dummy_loss(output) + loss.backward() + +def benchmark(models, batch_sizes, channels, spatial_size): + model_names = sorted(list(models.keys())) # make sure the models are always in the same order + forward_times = torch.zeros(len(model_names), len(batch_sizes)) + train_step_times = torch.zeros(len(model_names), len(batch_sizes)) + + for i, model_name in enumerate(model_names): + print(f"Benchmarking {model_name}...") + for j, batch_size in enumerate(batch_sizes): + + input_to_model = torch.rand(batch_size, channels, spatial_size[0], spatial_size[1]) + model = models[model_name] + + time_start = time.perf_counter() + model(input_to_model) + time_duration = time.perf_counter() - time_start + forward_times[i, j] = time_duration + + time_start = time.perf_counter() + train_step(model, input_to_model) + time_duration = time.perf_counter() - time_start + train_step_times[i, j] = time_duration + + return forward_times, train_step_times + +models = { + "ResNet18" : visionmodels.resnet18(), + "WideResNet50" : visionmodels.wide_resnet50_2(), + "DenseNet121" : visionmodels.densenet121(), + "EfficientNet" : visionmodels.efficientnet_b0(), + "EfficientNetv2" : visionmodels.efficientnet_v2_s(), + "MobileNetv3" : visionmodels.mobilenet_v3_small(), + # "GoogLeNet" : visionmodels.googlenet(), + "ConvNeXt" : visionmodels.convnext_tiny(), +} + +# the batch sizes which should be benchmarked +batch_sizes = (1, 32) +# size information (e.g. 
ImageNet-like images) +channels = 3 +spatial_size = (224, 224) # HW + +forward_times, train_step_times = benchmark(models, batch_sizes, channels, spatial_size) + +df = pd.DataFrame() +df["model_names"] = sorted(list(models.keys())) # make sure the models are always in the same order + +for (i, batch_size) in enumerate(batch_sizes): + df[f"inference, batch_size: {batch_size}"] = forward_times[:, i] + df[f"train, batch_size: {batch_size}"] = train_step_times[:, i] + +df.to_csv("benchmark_result_pytorch.csv") \ No newline at end of file From c5ec7138fbd0e232dfa59482883ede2c44a2bef8 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:32:28 +0200 Subject: [PATCH 31/37] Delete bench_torch.py --- bench_torch.py | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 bench_torch.py diff --git a/bench_torch.py b/bench_torch.py deleted file mode 100644 index 1a97dc673..000000000 --- a/bench_torch.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -import torchvision.models as models -from torch.profiler import profile, record_function, ProfilerActivity -import time - -model = models.efficientnet_v2_m() -model.eval() - -b_size = 1 -img = torch.rand(b_size, 3, 224, 224) - -with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof: - with record_function("model_inference"): - pred = model(img) - """ - with record_function("model_backward"): - loss = torch.sum(pred - 0.5) # dummy loss - loss.backward() - """ - -print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1)) -# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1)) - -start1 = time.perf_counter() -pred = model(img) -start2 = time.perf_counter() -loss = torch.sum(pred - 0.5) # dummy loss -loss.backward() -end = time.perf_counter() -print(f"Time used inference: {start2 - start1} seconds") -print(f"Time used backward: {end - start2} seconds") -print(f"Time used inference and backward: {end - start1} seconds") \ No newline at end of file From 3f1c6dc6adbc20b5653efa1a39b61d475ec1e4f0 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:34:39 +0200 Subject: [PATCH 32/37] Add files via upload --- bench_torch.py | 32 ++++++++++++++++++++++++++++++++ benchmark.jl | 12 +++++++++--- benchmark_result_julia.csv | 14 +++++++------- benchmark_result_pytorch.csv | 14 +++++++------- 4 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 bench_torch.py diff --git a/bench_torch.py b/bench_torch.py new file mode 100644 index 000000000..1a97dc673 --- /dev/null +++ b/bench_torch.py @@ -0,0 +1,32 @@ +import torch +import torchvision.models as models +from torch.profiler import profile, record_function, ProfilerActivity +import time + +model = models.efficientnet_v2_m() +model.eval() + +b_size = 1 +img = torch.rand(b_size, 3, 224, 224) + +with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof: + with record_function("model_inference"): + pred = model(img) + """ + with record_function("model_backward"): + loss = torch.sum(pred - 0.5) # dummy loss + loss.backward() + """ + +print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1)) +# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1)) + +start1 = time.perf_counter() +pred = model(img) +start2 = time.perf_counter() +loss = torch.sum(pred - 0.5) # dummy loss +loss.backward() +end = time.perf_counter() +print(f"Time 
used inference: {start2 - start1} seconds") +print(f"Time used backward: {end - start2} seconds") +print(f"Time used inference and backward: {end - start1} seconds") \ No newline at end of file diff --git a/benchmark.jl b/benchmark.jl index 83262f6e0..072144d51 100644 --- a/benchmark.jl +++ b/benchmark.jl @@ -1,5 +1,5 @@ using NNlib, Flux, Metalhead -using BenchmarkTools +using BenchmarkTools, Statistics using DataFrames, CSV forward(model, input) = model(input) @@ -28,8 +28,14 @@ function benchmark(models, dtype, batch_sizes, channels, spatial_size) forward(model, input) # compilation train_step(model, input) # compilation - forward_times[i, j] = @belapsed forward($model, $input) # @elapsed - train_step_times[i, j] = @belapsed train_step($model, $input) # @elapsed + # using @belapsed (minimum time) + #= + forward_times[i, j] = @belapsed forward($model, $input) + train_step_times[i, j] = @belapsed train_step($model, $input) + =# + # using median time + forward_times[i, j] = median(@benchmark forward($model, $input)).time / 10^9 + train_step_times[i, j] = median(@benchmark train_step($model, $input)).time / 10^9 end end diff --git a/benchmark_result_julia.csv b/benchmark_result_julia.csv index 13c60f90d..6ca132f8e 100644 --- a/benchmark_result_julia.csv +++ b/benchmark_result_julia.csv @@ -1,8 +1,8 @@ model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" -ConvNeXt,2.696525688337994,0.8577982223471639,0.3655661,0.1355693,1.0179392,1.1866884,1.1012307960913856,0.9863925898269857,4.8062238,4.364411,14.39549,14.5940776 -DenseNet121,2.8334019239672825,1.5740008937784382,0.1762512,0.0622048,0.7033689,0.4468669,1.263441740346075,0.9985112159016455,2.8772053,2.2772758,13.7427543,13.7632448 -EfficientNet,2.931731906635844,1.3684833239947383,0.1895995,0.0646715,0.7397858,0.5405881,1.0664467750727167,1.1269442586755343,2.4186004,2.2679054,8.6440818,7.6703721 -EfficientNetv2,5.053202171229253,5.590555646664826,0.6446749,0.1275775,12.9539883,2.31712,1.4478919823022125,1.224915348029691,6.3059734,4.3552789,22.0001035,17.9605093 -MobileNetv3,6.416791193464834,1.0744301651818315,0.0515294,0.0080304,0.1731701,0.1611739,1.2257825196180756,1.0673281389045863,0.3963721,0.3233625,1.6299691,1.527149 -ResNet18,1.2245235336423206,1.1195501549857,0.0490358,0.0400448,0.1998762,0.1785326,1.0889384258792703,0.9128196230532792,0.8104152,0.744225,3.1845247,3.4886681 -WideResNet50,0.670115626920713,0.7518492599095661,0.1744445,0.26032,0.7198077,0.957383,0.9693667932236841,0.8498274823283908,4.0205619,4.1476167,15.3664104,18.0817998 +ConvNeXt,3.0131868428607564,1.3994729097036838,0.4240265,0.1407236,1.43802405,1.0275469,1.1620754379865017,1.0632876498150545,4.6366846,3.9900031,14.7683656,13.8893418 +DenseNet121,2.7011062104755816,1.575624841888005,0.1855009,0.0686759,0.7069096,0.4486535,1.2534693725910626,1.036169100624124,2.6923755,2.1479388,12.6862194,12.2433871 +EfficientNet,6.669989485006747,2.4963127103581892,0.49731575,0.0745602,1.33507035,0.53481695,1.121879889261884,1.1298641673853496,2.5537233,2.2762894,8.1940817,7.2522715 
+EfficientNetv2,16.28186773870062,7.202334907846903,2.5620854,0.1573582,12.053267,1.67352215,1.4558721609174592,1.203183521905458,6.1329556,4.21256465,21.0444893,17.4906728 +MobileNetv3,12.105678302652656,1.5684538123069776,0.1103481,0.0091154,0.31291775,0.19950715,1.2884028351358188,1.1391237206595466,0.43895395,0.3406962,2.0458146,1.7959547 +ResNet18,1.321074637025202,1.0621579200481972,0.0558948,0.0423101,0.2110332,0.19868345,1.0855325609238786,0.8862720054297211,0.8071219,0.7435262,2.98925695,3.3728437 +WideResNet50,0.6797203960326701,0.7846926795922912,0.1863516,0.2741592,0.6916193,0.88138875,0.8693605563082452,0.7977827693085691,3.68315245,4.23662245,13.5918181,17.0369913 diff --git a/benchmark_result_pytorch.csv b/benchmark_result_pytorch.csv index 5f929352b..153e03686 100644 --- a/benchmark_result_pytorch.csv +++ b/benchmark_result_pytorch.csv @@ -1,8 +1,8 @@ ,model_names,"inference, batch_size: 1","train, batch_size: 1","inference, batch_size: 32","train, batch_size: 32" -0,ConvNeXt,0.0805797,0.2446732,1.598144,4.723875 -1,DenseNet121,0.1164808,0.2522871,1.5435163,4.5821567 -2,EfficientNet,0.0598909,0.1230739,0.9899088,2.6415434 -3,EfficientNetv2,0.13141,0.2856547,1.8736856,5.1560116 -4,MobileNetv3,0.0220434,0.0392312,0.2386686,0.5895018 -5,ResNet18,0.0289765,0.0681748,0.4151833,1.2125939 -6,WideResNet50,0.1647507,0.4650428,2.3093054,6.9086385 +0,ConvNeXt,0.0838077,0.3047408,1.6291157,4.6741033 +1,DenseNet121,0.108188,0.2309508,1.5926049,4.6524887 +2,EfficientNet,0.0661599,0.1544869,0.9558551,2.707274 +3,EfficientNetv2,0.129968,0.2956851,1.830179,5.1837134 +4,MobileNetv3,0.022812,0.0398323,0.2456348,0.5598582 +5,ResNet18,0.0305176,0.074905,0.4218851,1.192504 +6,WideResNet50,0.1575364,0.4871242,2.2386081,6.8065705 From 8dde5f98b133f6cfb6bc1da3909c96200b4ae303 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:34:52 +0200 Subject: [PATCH 33/37] Add files via upload --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index fe9314f97..14cee5b4f 100644 --- a/Project.toml +++ b/Project.toml @@ -42,7 +42,6 @@ KernelAbstractions = "0.9.2" Requires = "1.0" cuDNN = "1" julia = "1.9" -# LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" From 55051570e35db58c2e5a1f344c44807b3db6c728 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:35:15 +0200 Subject: [PATCH 34/37] Add files via upload --- .../NNlibLoopVectorizationExt.jl | 5 +- ext/NNlibLoopVectorizationExt/activations.jl | 26 ++ ext/NNlibLoopVectorizationExt/conv.jl | 421 +----------------- ext/NNlibLoopVectorizationExt/pooling.jl | 63 +-- 4 files changed, 83 insertions(+), 432 deletions(-) create mode 100644 ext/NNlibLoopVectorizationExt/activations.jl diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl index 3be6c28a2..6e4b8d8e0 100644 --- a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl +++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl @@ -5,11 +5,8 @@ using LoopVectorization using Random, Statistics using OffsetArrays, Static -# Bevor die Methoden überladen werden, sollte ein Selfcheck durchgeführt werden, ob die Ergebnisse mit NNlib übereinstimmen. -# Wenn nicht, sollte eine Warnung angezeigt werden und die wenn irgendwie möglich, nur die funktionierenden Methoden verwendet werden. -# Z.b. 
in dem bei falschem Ergebniss, die im2col Methode als Backend in der überladenen Methode aufgerufen wird. - include("conv.jl") include("pooling.jl") +include("activations.jl") end # module \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/activations.jl b/ext/NNlibLoopVectorizationExt/activations.jl new file mode 100644 index 000000000..8201e7299 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/activations.jl @@ -0,0 +1,26 @@ +_tanh(x) = tanh(x) +Base.broadcasted(::typeof(tanh), x::AbstractArray) = @turbo _tanh.(x) + +_softsign(x) = x / (1 + abs(x)) +Base.broadcasted(::typeof(NNlib.softsign), x::AbstractArray) = @turbo _softsign.(x) + +_softplus(x) = log1p(exp(-abs(x))) +Base.broadcasted(::typeof(NNlib.softplus), x::AbstractArray) = (@turbo _softplus.(x)) .+ NNlib.relu.(x) + +function _sigmoid(x) + t = exp(-abs(x)) + ifelse(x ≥ 0, inv(1 + t), t / (1 + t)) +end +Base.broadcasted(::typeof(NNlib.sigmoid), x::AbstractArray) = @turbo _sigmoid.(x) +Base.broadcasted(::typeof(NNlib.sigmoid_fast), x::AbstractArray) = @turbo _sigmoid.(x) # don't do the same for tanh_fast, it would be slower + +function _hardsigmoid(x) + clamp((x + 3) / 6, 0, 1) +end +Base.broadcasted(::typeof(NNlib.hardsigmoid), x::AbstractArray) = @turbo _hardsigmoid.(x) + +_logsigmoid(x) = -_softplus(-x) +Base.broadcasted(::typeof(NNlib.logsigmoid), x::AbstractArray) = @turbo _logsigmoid.(x) + +_swish(x) = x * _sigmoid(x) +Base.broadcasted(::typeof(NNlib.swish), x::AbstractArray) = @turbo _swish.(x) \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index a75026b82..0ca75f466 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -1,9 +1,19 @@ -# pad naivly, static iters -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} +#= +Accelerated convolution for 2d-images using the power of LoopVectorization. +The acceleration is usually greatest when the inputs have a large spatial size and few channels. +Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. + +Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the +original implementation in some situations, it is left out for the moment. 
+ +Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl - # if cdims.stride != (1, 1) - # return conv_im2col_grouped!(output, input, weight, cdims) - # end +Implementation for backward pass mostly copied from here (Chris Elrod, MIT license): +https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl +=# + +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) @@ -48,15 +58,11 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - # m = y_out + static((y_stride - 1)) * (y_out - 1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) m = y_out + (y_stride - 1) * (y_out - 1) n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) - # x_in = n + (x_w - 1) - # value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width value += input[n + x_w - 1, m + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value @@ -68,14 +74,11 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - # m = y_out + static((y_stride - 1)) * (y_out - 1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) m = y_out + (y_stride - 1) * (y_out - 1) n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value @@ -86,283 +89,14 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, # println("forward: general case for any convolution") @tturbo for index_batch in 1:batch_size - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width - # m = y_out + static((y_stride - 1)) * (y_out - 1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - for in_channel_weight in static(1):static(in_channels_weight), y_w 
in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - end - - return output -end - -#= -# pad bounds check -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - - # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) - - output_width, output_height, _ = size(output) - input_width, input_height, in_channels, batch_size = size(input) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - out_channels_per_group = out_channels ÷ groups - - # We use calc_padding_regions to split outselves up into separate regions that may or - # may not need to worry about padding: - cdims_3d = DenseConvDims((input_width, input_height, 1, in_channels, batch_size), (weight_width, weight_height, 1, in_channels_weight, out_channels), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) - # println(pdims_3d.padding) - padded_regions, central_region = NNlib.calc_padding_regions(cdims_3d) - - # Start with the central region - w_region, h_region, _ = central_region - - if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("forward: very specialized case for maximum performance") - - @tturbo for index_batch in 1:batch_size - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - value += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance - # println("forward: second specialized case for better performance") - - @tturbo for index_batch in 1:batch_size - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w 
in static(1):static(weight_width) - # y_in = m + (y_w - 1) - # x_in = n + (x_w - 1) - value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - elseif groups == 1 # third specialized case for better performance - # println("forward: third specialized case for better performance") - - @tturbo for index_batch in 1:batch_size - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation - value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - else # general case for any convolution - # println("forward: general case for any convolution") - - @tturbo for index_batch in 1:batch_size - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - end - - # @show w_region # 1:0 warning: when padding is unequal and one of x_pad1 or y_pad1 is 0, emty collections are possible - # @show h_region # 1:0 if isempty(1:0) -> true - # println() - - if cdims.padding != (0, 0, 0, 0) - # Next, the padded regions - for (w_region, h_region, d_region) in padded_regions # @inbounds - for z_out in d_region # for skipping the d_regions - - if cdims.groupcount == 1 - - @tturbo for index_batch in 1:batch_size # @turbo - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - value = zero(T) - # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - - is_in_bound_x = (x_in >= 1) & (x_in <= input_width) - is_in_bound_y = (y_in >= 1) & (y_in <= input_height) - - input_value = 
(is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel, index_batch] : zero(T) - value += input_value * weight[x_w, y_w, in_channel, out_channel] - - # value += (ib0 & ib1) ? input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) - - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - else - - @tturbo for index_batch in 1:batch_size # @turbo - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - - is_in_bound_x = (x_in >= 1) & (x_in <= input_width) - is_in_bound_y = (y_in >= 1) & (y_in <= input_height) - - input_value = (is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel_input, index_batch] : zero(T) - value += input_value * weight[x_w, y_w, in_channel_weight, out_channel] - - # value += (ib0 & ib1) ? input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] : zero(T) - - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - end - end - end - end - - return output -end -=# - -#= -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} - - # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) - - # padding is done naively at the moment - if cdims.padding != (0, 0, 0, 0) - input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2)) - end - - output_width, output_height, _ = size(output) - input_width, input_height, in_channels, batches = size(input) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - - if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("forward: very specialized case for maximum performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - elseif groups == 1 # second specialized case for better performance - # println("forward: second specialized case for better performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - else # general case for any convolution - # println("forward: general case for any convolution") - - @tturbo for index_batch in 1:batches for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width m = y_out + (y_stride - 1) * (y_out - 1) n = x_out + (x_stride - 1) * (x_out - 1) out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) + # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_weight + (group - 1) * in_channels_weight, index_batch] * weight[x_w, y_w, 
in_channel_weight, out_channel] end output[x_out, y_out, out_channel, index_batch] = value end @@ -372,19 +106,6 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, return output end -=# - -function conv_im2col_grouped!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} - - conv!( - NNlib.insert_singleton_spatial_dimension(output, 1), - NNlib.insert_singleton_spatial_dimension(input, 1), - NNlib.insert_singleton_spatial_dimension(weight, 1), - NNlib.insert_singleton_spatial_dimension(cdims, 1) - ) - - return output -end function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} @@ -398,18 +119,6 @@ function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradien return input_gradient end -function ∇conv_filter_im2col_grouped!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims) where {T<:Real} - - ∇conv_filter!( - NNlib.insert_singleton_spatial_dimension(weight_gradient, 1), - NNlib.insert_singleton_spatial_dimension(input, 1), - NNlib.insert_singleton_spatial_dimension(output_gradient, 1), - NNlib.insert_singleton_spatial_dimension(cdims, 1) - ) - - return weight_gradient -end - function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) @@ -452,8 +161,8 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 value = zero(T) - # for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) # RICHTIG!! - for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 + for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) + # for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width) is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height) @@ -479,86 +188,4 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ end return input_gradient -end - -#= -function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} - # println("myconv data back called") - - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - # check if input_gradient must be padded - if cdims.padding != (0, 0, 0, 0) - input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) - else - input_gradient_padded = input_gradient - end - # store the size of input after padding - input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding - - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - # actual computation - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("very specialized case for maximum performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - elseif groups == 1 # second specialized case for better performance - # println("second specialized case for better performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - else # general case for any convolution - # println("general case for any convolution") - Threads.@threads for index_batch in 1:current_batch_size - for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results - @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - input_gradient_padded[x_in, y_in, in_channel_input, 
index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - end - - # depad - if cdims.padding != (0, 0, 0, 0) - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] - end - - return input_gradient -end -=# \ No newline at end of file +end \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl index 420bb7413..5af194408 100644 --- a/ext/NNlibLoopVectorizationExt/pooling.jl +++ b/ext/NNlibLoopVectorizationExt/pooling.jl @@ -13,8 +13,8 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; NNlib.check_dims(size(input), size(output), pdims) # storing all the necessary shapes - input_width, input_height, channels, current_batch_size = size(input) - output_width, output_height, channels, current_batch_size = size(output) + input_width, input_height, channels, batch_size = size(input) + output_width, output_height, channels, batch_size = size(output) kernel_width, kernel_height = pdims.kernel_size x_stride, y_stride = pdims.stride @@ -26,7 +26,7 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; # We use calc_padding_regions to split outselves up into separate regions that may or # may not need to worry about padding: - pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + pdims_3d = PoolDims((input_width, input_height, 1, channels, batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) # println(pdims_3d.padding) padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) @@ -40,12 +40,11 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance # println("specialized case for better performance") - @tturbo for index_batch in 1:current_batch_size + @tturbo for index_batch in 1:batch_size # compute pooling for each channel separatly for channel in 1:channels, y_out in h_region, x_out in w_region kernel_sum = zero(T) for y_w in 1:kernel_height, x_w in 1:kernel_width - # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] end output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha @@ -55,52 +54,54 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; else # general case for any meanpooling # println("general case for any meanpooling") - @tturbo for index_batch in 1:current_batch_size + @tturbo for index_batch in 1:batch_size # compute pooling for each channel separatly for channel in 1:channels, y_out in h_region, x_out in w_region m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 kernel_sum = zero(T) for y_w in 1:kernel_height, x_w in 1:kernel_width - y_in = m + (y_w - 1) * y_dilation # - y_pad1 - x_in = n + (x_w - 1) * x_dilation # - x_pad1 + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation kernel_sum += input[x_in, y_in, channel, 
index_batch] end output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha end end - end + end # Next, the padded regions - @inbounds for (w_region, h_region, d_region) in padded_regions - for index_batch in 1:current_batch_size, channel in 1:channels - for d in d_region # for skipping the d_regions - for h in h_region - ph = project(h, y_stride, y_pad1) - for w in w_region - pw = project(w, x_stride, x_pad1) - m = zero(T) - - for kh in 1:kernel_height - input_kh = ph + (kh - 1) * y_dilation - if input_kh <= 0 || input_kh > input_height - continue - end + if pdims.padding != (0, 0, 0, 0) + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:batch_size, channel in 1:channels + for z_out in d_region # for skipping the d_regions + for y_out in h_region + m = project(y_out, y_stride, y_pad1) + for x_out in w_region + n = project(x_out, x_stride, x_pad1) + kernel_sum = zero(T) - for kw in 1:kernel_width - input_kw = pw + (kw - 1) * x_dilation - if input_kw <= 0 || input_kw > input_width + for y_w in 1:kernel_height + y_in = m + (y_w - 1) * y_dilation + if y_in <= 0 || y_in > input_height continue end - m += input[input_kw, input_kh, channel, index_batch] + for x_w in 1:kernel_width + x_in = n + (x_w - 1) * x_dilation + if x_in <= 0 || x_in > input_width + continue + end + + kernel_sum += input[x_in, y_in, channel, index_batch] + end end - end - output[w, h, channel, index_batch] = _alpha * m - end - end + output[x_out, y_out, channel, index_batch] = _alpha * kernel_sum + end + end + end end end end From 0aa3a3fe1e727f68fda6013c69b5e5139abdd97d Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:36:37 +0200 Subject: [PATCH 35/37] Add files via upload From 35f2b772944ba594aa93483691501c79cf65808c Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:36:53 +0200 Subject: [PATCH 36/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 262ae42d0..f3cc771c8 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -32,7 +32,7 @@ function compute_pool_outputs(settings::Vector{<:NNlib.PoolDims}, input::Array{T return pool_outs end -@testset "Convolution & Pooling" begin +@testset "Convolution & Pooling & Activations" begin dtype = Float32 # Float64 batch_size = 64 # 1 # 64 # 32 @@ -44,34 +44,44 @@ end NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'second specialized case' NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'third specialized case' - ### NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' - NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 1), dilation=(2, 1), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' ] conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, 
batch_size) for setting in conv_settings_list] pool_settings_list = [ NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' - # NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' - NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 1), dilation=(2, 1)), # test 'general case' + NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] + modified_activations = [tanh, softsign, softplus, sigmoid, sigmoid_fast, hardsigmoid, logsigmoid, swish] + # compute outputs before loading LoopVectorization println("without LoopVectorization") conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) + act_outs_std = [@btime $act.($input) for act in modified_activations] using LoopVectorization # now load the NNlibLoopVectorizationExt println("with LoopVectorization") conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) + act_outs_lv = [@btime $act.($input) for act in modified_activations] # validate conv - @test all(isapprox.(conv_outs_std, conv_outs_lv)) - @test all(isapprox.(conv_grads_std, conv_grads_lv)) + @testset "Convolution" begin + @test all(isapprox.(conv_outs_std, conv_outs_lv)) + @test all(isapprox.(conv_grads_std, conv_grads_lv)) + end # validate pool - @test all(isapprox.(pool_outs_std, pool_outs_lv)) + @testset "Pooling" begin + @test all(isapprox.(pool_outs_std, pool_outs_lv)) + end + # validate activations + @testset "Activations" begin + @test all(isapprox.(act_outs_std, act_outs_lv)) + end end \ No newline at end of file From 07943d7213f22dba49b44c8c2762117e6abd1e58 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Mon, 9 Oct 2023 11:49:46 +0200 Subject: [PATCH 37/37] Add files via upload --- benchmark_result_julia_BLAS.set_num_threads(1).csv | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 benchmark_result_julia_BLAS.set_num_threads(1).csv diff --git a/benchmark_result_julia_BLAS.set_num_threads(1).csv b/benchmark_result_julia_BLAS.set_num_threads(1).csv new file mode 100644 index 000000000..8f84f3c6d --- /dev/null +++ b/benchmark_result_julia_BLAS.set_num_threads(1).csv @@ -0,0 +1,8 @@ +model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" +ConvNeXt,2.3560260677286187,1.177474414020855,0.4724393,0.2005238,1.5661168,1.3300644,1.1190087278994194,1.0605556431775283,6.1761583,5.519312,20.2704419,19.1130395 +DenseNet121,3.122060107720349,1.6430011190729519,0.2130831,0.0682508,0.7872384,0.4791466,1.0415097744527333,0.9663276503256204,2.2208416,2.1323291,13.0980777,13.5544892 +EfficientNet,7.620280694206955,2.614653451310472,0.55973705,0.0734536,1.3205672,0.50506395,1.0814317166163767,1.4227466786836107,2.3782218,2.1991419,7.8866889,5.543284 
+EfficientNetv2,19.58985619614861,6.567161064233096,3.0016087,0.1532226,12.1968172,1.8572435,1.393368850971046,1.4053620350129883,5.7712793,4.1419609,21.7360648,15.4665234 +MobileNetv3,11.880463635205707,1.4356279307443949,0.1165919,0.00981375,0.31690295,0.2207417,1.2244903507960017,1.0962253667265058,0.4233406,0.345728,1.9161035,1.7479102 +ResNet18,1.8553921481455138,1.1963831276674246,0.0782507,0.04217475,0.26524125,0.2217026,0.7610563817484197,0.8134254105937594,0.5683152,0.7467452,3.1617936,3.8870111 +WideResNet50,1.244074585655623,1.0739855756563101,0.33919395,0.2726476,1.0936016,1.0182647,0.625895474130196,0.8091050354680169,2.59841495,4.1515158,16.8332114,20.8047295
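
A minimal usage sketch (not taken verbatim from the patches above; array sizes and variable names are illustrative), following the approach in test/ext_loopvectorization/runtests.jl: loading LoopVectorization activates NNlibLoopVectorizationExt, so the same convolution can be evaluated with the default im2col backend first and with the @tturbo kernels afterwards, and the two results compared.

    using NNlib

    x = rand(Float32, 64, 64, 3, 2)       # WHCN input (illustrative size)
    w = rand(Float32, 5, 5, 3, 16)        # 5x5 kernel, 3 -> 16 channels
    cdims = NNlib.DenseConvDims(size(x), size(w); stride=(1, 1), padding=(2, 2), dilation=(1, 1))

    y_im2col = NNlib.conv(x, w, cdims)    # default im2col backend

    using LoopVectorization               # loading this triggers NNlibLoopVectorizationExt

    y_lv = NNlib.conv(x, w, cdims)        # conv! now dispatches to the LoopVectorization kernels

    @assert isapprox(y_im2col, y_lv)      # both backends should agree

Loading LoopVectorization after the first call mirrors what the test suite itself does; in normal use it can simply be loaded up front alongside NNlib and Flux.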