From 6d32073977fb94c4d4fd043f89836b47d43b21f5 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:55:18 +0200 Subject: [PATCH 01/37] Add files via upload --- .../NNlibLoopVectorizationExt.jl | 10 + ext/NNlibLoopVectorizationExt/conv.jl | 181 +++++++++++ ext/NNlibLoopVectorizationExt/conv_old.jl | 304 ++++++++++++++++++ ext/NNlibLoopVectorizationExt/pooling.jl | 109 +++++++ ext/NNlibLoopVectorizationExt/pooling_old.jl | 201 ++++++++++++ 5 files changed, 805 insertions(+) create mode 100644 ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl create mode 100644 ext/NNlibLoopVectorizationExt/conv.jl create mode 100644 ext/NNlibLoopVectorizationExt/conv_old.jl create mode 100644 ext/NNlibLoopVectorizationExt/pooling.jl create mode 100644 ext/NNlibLoopVectorizationExt/pooling_old.jl diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl new file mode 100644 index 000000000..d2053161c --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl @@ -0,0 +1,10 @@ +module NNlibLoopVectorizationExt + +using NNlib +using LoopVectorization +using Random, Statistics + +include("conv.jl") +include("pooling.jl") + +end # module \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl new file mode 100644 index 000000000..3aafe9246 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -0,0 +1,181 @@ +#= +Accelerated convolution for 2d-images using the power of LoopVectorization. +The acceleration is usually greatest when the inputs have a large spatial size and few channels. +Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. + +Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation copied from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl +=# + +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + # padding is done naively at the moment + if cdims.padding != (0, 0, 0, 0) + input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2)) + end + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batches = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("forward: very specialized case for maximum performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 # second specialized case for better performance + # println("forward: second specialized case for better performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else # general case for any convolution + # println("forward: general case for any convolution") + + @tturbo for index_batch in 1:batches + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + + return output +end + +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded (padding is done naively at the moment) + if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) + else + input_gradient_padded = input_gradient + end + + # store the size of input after padding + input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + # actual computation (using @tturbo instead of Threads.@threads + @turbo may end up in wrong results) + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("backward: very specialized case for maximum performance") + + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + elseif groups == 1 # second specialized case for better performance + # println("backward: second specialized case for better performance") + + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + else # general case for any convolution + # println("backward: general case for any convolution") + + Threads.@threads for index_batch in 1:current_batch_size + for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results + @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + 
for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + + end + + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] + end + + return input_gradient +end \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/conv_old.jl b/ext/NNlibLoopVectorizationExt/conv_old.jl new file mode 100644 index 000000000..18cc294b6 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/conv_old.jl @@ -0,0 +1,304 @@ +#= +Implementation copied from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl +Could include bias & activation too, hence overload `conv_bias_act`, +at the cost of needing gradient rules for that. +=# + +function zero_pad_2d(input::AbstractArray{T, 4}, padding::NTuple{4, Int}) where {T <: Number} + width, height, channels, current_batch_size = size(input) + x_pad1, x_pad2, y_pad1, y_pad2 = padding + output_height, output_width = height + y_pad1 + y_pad2, width + x_pad1 + x_pad2 + output = zeros(T, output_width, output_height, channels, current_batch_size) + output[x_pad1 + 1:output_width - x_pad2, y_pad1 + 1:output_height - y_pad2, :, :] = input + + return output +end + +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + # println("myconv called") + + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + if cdims.padding != (0, 0, 0, 0) + #= + invoke(NNlib.conv!, + Tuple{AbstractArray{T,4},AbstractArray{T,4},AbstractArray{T,4},ConvDims}, + output, input, weight, cdims; kw...) 
+ =# + input = zero_pad_2d(input, cdims.padding) + end + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batches = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) + # println("very specialized case for maximum performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] + end + end + + elseif groups == 1 + # println("second specialized case for better performance") + + @tturbo for index_batch in 1:batches + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + value = zero(T) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] + end + end + + else + # println("general case for any convolution") + + @tturbo for index_batch in 1:batches + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] + end + end + end + + return output +end + +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + # println("myconv data back called") + + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded + if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = zero_pad_2d(input_gradient, cdims.padding) + else + input_gradient_padded = input_gradient + end + # store the size of input after padding + input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding + + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + # actual computation + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("very specialized case for maximum performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + elseif groups == 1 # second specialized case for better performance + # println("second specialized case for better performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + else # general case for any convolution + # println("general case for any convolution") + Threads.@threads for index_batch in 1:current_batch_size + for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results + @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += 
weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + end + + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] + end + + return input_gradient +end + +#= +function NNlib.∇conv_filter!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + # println("myconv filter back called") + + size_weight_check_dims = (size(weight_gradient)[1:2]..., size(weight_gradient)[3]*cdims.groupcount, size(weight_gradient)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + input_width, input_height, in_channels, current_batch_size = size(input) + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight_gradient) + + # check if input must be padded + if cdims.padding != (0, 0, 0, 0) + input_padded = zero_pad_2d(input, cdims.padding) + else + input_padded = input + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + # actual computation + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("very specialized case for maximum performance") + #= + @tturbo for out_channel in 1:out_channels + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value = zero(T) + for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width + value += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + weight_gradient[x_w, y_w, in_channel, out_channel] = value + end + end + =# + weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) + @tturbo for index_batch in 1:current_batch_size # Threads.@threads + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) + weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) + elseif groups == 1 # second specialized case for better performance + # println("second specialized case for better performance") + #= + @tturbo for out_channel in 1:out_channels + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + value = zero(T) + for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + value += 
input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + weight_gradient[x_w, y_w, in_channel, out_channel] = value + end + end + =# + weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) + @tturbo for index_batch in 1:current_batch_size # Threads.@threads + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) + weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) + else # general case for any convolution + # println("general case for any convolution") + #= + @tturbo for out_channel_per_group in 1:out_channels_per_group + for group in 1:groups, in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + value = zero(T) + for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + weight_gradient[x_w, y_w, in_channel_weight, out_channel] = value + end + end + =# + weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) + @tturbo for index_batch in 1:current_batch_size # Threads.@threads + for out_channel_per_group in 1:out_channels_per_group # @turbo + for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + weight_gradient_batched[x_w, y_w, in_channel_weight, out_channel, index_batch] += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) + weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) + end + + if !NNlib.flipkernel(cdims) + weight_gradient = reverse(weight_gradient, dims=(1, 2)) + end + + return weight_gradient +end +=# \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl new file mode 100644 index 000000000..dc948335b --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/pooling.jl @@ -0,0 +1,109 @@ +#= +Accelerated mean pooling for 2d-images using the power 
of LoopVectorization. +The speed up is usually lower compared to conv but can be approximately up to 2x. + +Since the current state of LoopVectorization ∇meanpool! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation inspired from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl +=# + +function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} + NNlib.check_dims(size(input), size(output), pdims) + + # storing all the necessary shapes + input_width, input_height, channels, current_batch_size = size(input) + output_width, output_height, channels, current_batch_size = size(output) + kernel_width, kernel_height = pdims.kernel_size + + x_stride, y_stride = pdims.stride + x_dilation, y_dilation = pdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding + + # A helper function to project from output (w, h) to input (input_w, input_h) + @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) + + # We represent division by kernel size by rolling it + # into the `alpha` multiplier. + _alpha = T(1 / prod(pdims.kernel_size)) + + # Start with the central region + w_region, h_region, _ = central_region + + if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance + # println("specialized case for better performance") + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] + kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + else # general case for any meanpooling + # println("general case for any meanpooling") + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + y_in = m + (y_w - 1) * y_dilation # - y_pad1 + x_in = n + (x_w - 1) * x_dilation # - x_pad1 + kernel_sum += input[x_in, y_in, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + end + + # Next, the padded regions + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:current_batch_size, channel in 1:channels + for d in d_region # for skipping the d_regions + for h in h_region + ph = project(h, y_stride, y_pad1) + for w in w_region + pw = project(w, x_stride, x_pad1) + m = zero(T) + + for kh in 1:kernel_height + input_kh = ph + (kh - 1) * y_dilation + if input_kh <= 0 || input_kh > 
input_height + continue + end + + for kw in 1:kernel_width + input_kw = pw + (kw - 1) * x_dilation + if input_kw <= 0 || input_kw > input_width + continue + end + + m += input[input_kw, input_kh, channel, index_batch] + end + end + + output[w, h, channel, index_batch] = _alpha * m + end + end + end + end + end + + return output +end \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/pooling_old.jl b/ext/NNlibLoopVectorizationExt/pooling_old.jl new file mode 100644 index 000000000..dc8fe34d5 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/pooling_old.jl @@ -0,0 +1,201 @@ +#= +Accelerated mean pooling for 2d-images using the power of LoopVectorization. +The speed up is lower compared to conv but can be approximately up to 2x. + +Since the current state of LoopVectorization ∇meanpool! isn't really faster than the +original implementation in some situations, it is left out for the moment. + +Implementation inspired from here (Jonas Steinebach, MIT): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl +=# + +function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} + # storing all the necessary shapes + input_width, input_height, channels, current_batch_size = size(input) + output_width, output_height, channels, current_batch_size = size(output) + kernel_width, kernel_height = pdims.kernel_size + + x_stride, y_stride = pdims.stride + x_dilation, y_dilation = pdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding + + # A helper function to project from output (w, h) to input (input_w, input_h) + @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) + + # We represent division by kernel size by rolling it + # into the `alpha` multiplier. 
+ _alpha = T(1 / prod(pdims.kernel_size)) + + # Start with the central region + w_region, h_region, _ = central_region + + if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] + kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + else # general case for any meanpooling + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 + kernel_sum = zero(T) + for y_w in 1:kernel_height, x_w in 1:kernel_width + y_in = m + (y_w - 1) * y_dilation # - y_pad1 + x_in = n + (x_w - 1) * x_dilation # - x_pad1 + kernel_sum += input[x_in, y_in, channel, index_batch] + end + output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha + end + end + + end + + # Next, the padded regions + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:current_batch_size, channel in 1:channels + for d in d_region # for skipping the d_regions + for h in h_region + ph = project(h, y_stride, y_pad1) + for w in w_region + pw = project(w, x_stride, x_pad1) + m = zero(T) + + for kh in 1:kernel_height + input_kh = ph + (kh - 1) * y_dilation + if input_kh <= 0 || input_kh > input_height + continue + end + + for kw in 1:kernel_width + input_kw = pw + (kw - 1) * x_dilation + if input_kw <= 0 || input_kw > input_width + continue + end + + m += input[input_kw, input_kh, channel, index_batch] + end + end + + output[w, h, channel, index_batch] = _alpha * m + end + end + end + end + end + + return output +end + +function NNlib.∇meanpool!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} + # storing all the necessary shapes + input_width, input_height, channels, current_batch_size = size(input_gradient) + output_width, output_height, channels, current_batch_size = size(output_gradient) + kernel_width, kernel_height = pdims.kernel_size + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + + x_stride, y_stride = pdims.stride + x_dilation, y_dilation = pdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding + + # A helper function to project from output (w, h) to input (input_w, input_h) + @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) + + # We represent division by kernel size by rolling it + # into the `alpha` multiplier. 
+ _alpha = T(1 / prod(pdims.kernel_size)) + + # Start with the central region + w_region, h_region, _ = central_region + + if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] + for y_w in 1:kernel_height, x_w in 1:kernel_width + input_gradient[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] += output_gradient_value * _alpha + end + end + end + + else # general case for any meanpooling + + @tturbo for index_batch in 1:current_batch_size + # compute pooling for each channel separatly + for channel in 1:channels, y_out in h_region, x_out in w_region + output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 + for y_w in 1:kernel_height, x_w in 1:kernel_width + y_in = m + (y_w - 1) * y_dilation # - y_pad1 + x_in = n + (x_w - 1) * x_dilation # - x_pad1 + input_gradient[x_in, y_in, channel, index_batch] += output_gradient_value * _alpha + end + end + end + + end + + # Next, the padded regions + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:current_batch_size, channel in 1:channels + for d in d_region # for skipping the d_regions + for h in h_region + ph = project(h, y_stride, y_pad1) + for w in w_region + pw = project(w, x_stride, x_pad1) + + # Grab the incoming gradient at this index for future use + output_gradient_value = output_gradient[w, h, channel, index_batch] + + for kh in 1:kernel_height + input_kh = ph + (kh - 1) * y_dilation + if input_kh <= 0 || input_kh > input_height + continue + end + + for kw in 1:kernel_width + input_kw = pw + (kw - 1) * x_dilation + if input_kw <= 0 || input_kw > input_width + continue + end + + input_gradient[input_kw, input_kh, channel, index_batch] += output_gradient_value * _alpha + end + end + + end + end + end + end + end + + return input_gradient +end \ No newline at end of file From 2a3cf3eecf892c0fba0904567d678813f1587766 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:55:50 +0200 Subject: [PATCH 02/37] Delete ext/NNlibLoopVectorizationExt/conv_old.jl --- ext/NNlibLoopVectorizationExt/conv_old.jl | 304 ---------------------- 1 file changed, 304 deletions(-) delete mode 100644 ext/NNlibLoopVectorizationExt/conv_old.jl diff --git a/ext/NNlibLoopVectorizationExt/conv_old.jl b/ext/NNlibLoopVectorizationExt/conv_old.jl deleted file mode 100644 index 18cc294b6..000000000 --- a/ext/NNlibLoopVectorizationExt/conv_old.jl +++ /dev/null @@ -1,304 +0,0 @@ -#= -Implementation copied from here (Jonas Steinebach, MIT): -https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl -Could include bias & activation too, hence overload `conv_bias_act`, -at the cost of needing gradient rules for that. 
-=# - -function zero_pad_2d(input::AbstractArray{T, 4}, padding::NTuple{4, Int}) where {T <: Number} - width, height, channels, current_batch_size = size(input) - x_pad1, x_pad2, y_pad1, y_pad2 = padding - output_height, output_width = height + y_pad1 + y_pad2, width + x_pad1 + x_pad2 - output = zeros(T, output_width, output_height, channels, current_batch_size) - output[x_pad1 + 1:output_width - x_pad2, y_pad1 + 1:output_height - y_pad2, :, :] = input - - return output -end - -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # println("myconv called") - - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) - - if cdims.padding != (0, 0, 0, 0) - #= - invoke(NNlib.conv!, - Tuple{AbstractArray{T,4},AbstractArray{T,4},AbstractArray{T,4},ConvDims}, - output, input, weight, cdims; kw...) - =# - input = zero_pad_2d(input, cdims.padding) - end - - output_width, output_height, _ = size(output) - input_width, input_height, in_channels, batches = size(input) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - - if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) - # println("very specialized case for maximum performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] - end - end - - elseif groups == 1 - # println("second specialized case for better performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] - end - end - - else - # println("general case for any convolution") - - @tturbo for index_batch in 1:batches - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * 
in_channels_weight - value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value #+ bias[out_channel] - end - end - end - - return output -end - -function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # println("myconv data back called") - - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - # check if input_gradient must be padded - if cdims.padding != (0, 0, 0, 0) - input_gradient_padded = zero_pad_2d(input_gradient, cdims.padding) - else - input_gradient_padded = input_gradient - end - # store the size of input after padding - input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding - - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - # actual computation - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("very specialized case for maximum performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - elseif groups == 1 # second specialized case for better performance - # println("second specialized case for better performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - else # general case for any convolution - # println("general case for any convolution") - Threads.@threads for index_batch in 1:current_batch_size - for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results - @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - 
out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - end - - # depad - if cdims.padding != (0, 0, 0, 0) - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] - end - - return input_gradient -end - -#= -function NNlib.∇conv_filter!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # println("myconv filter back called") - - size_weight_check_dims = (size(weight_gradient)[1:2]..., size(weight_gradient)[3]*cdims.groupcount, size(weight_gradient)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - input_width, input_height, in_channels, current_batch_size = size(input) - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight_gradient) - - # check if input must be padded - if cdims.padding != (0, 0, 0, 0) - input_padded = zero_pad_2d(input, cdims.padding) - else - input_padded = input - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - # actual computation - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("very specialized case for maximum performance") - #= - @tturbo for out_channel in 1:out_channels - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value = zero(T) - for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width - value += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - weight_gradient[x_w, y_w, in_channel, out_channel] = value - end - end - =# - weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) - @tturbo for index_batch in 1:current_batch_size # Threads.@threads - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) - weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) - elseif groups == 1 # second specialized case for better performance - # println("second specialized case for better performance") - #= - @tturbo for out_channel in 
1:out_channels - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value = zero(T) - for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - value += input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - weight_gradient[x_w, y_w, in_channel, out_channel] = value - end - end - =# - weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) - @tturbo for index_batch in 1:current_batch_size # Threads.@threads - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # @turbo - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - weight_gradient_batched[x_w, y_w, in_channel, out_channel, index_batch] += input_padded[x_in, y_in, in_channel, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) - weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) - else # general case for any convolution - # println("general case for any convolution") - #= - @tturbo for out_channel_per_group in 1:out_channels_per_group - for group in 1:groups, in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - value = zero(T) - for index_batch in 1:current_batch_size, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - weight_gradient[x_w, y_w, in_channel_weight, out_channel] = value - end - end - =# - weight_gradient_batched = zeros(T, weight_width, weight_height, in_channels_weight, out_channels, current_batch_size) - @tturbo for index_batch in 1:current_batch_size # Threads.@threads - for out_channel_per_group in 1:out_channels_per_group # @turbo - for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - weight_gradient_batched[x_w, y_w, in_channel_weight, out_channel, index_batch] += input_padded[x_in, y_in, in_channel_input, index_batch] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - # weight_gradient .= @time dropdims(sum(weight_gradient_batched, dims=5), dims=5) - weight_gradient .= dropdims(sum(weight_gradient_batched, dims=5), dims=5) - end - - if !NNlib.flipkernel(cdims) - weight_gradient = 
reverse(weight_gradient, dims=(1, 2)) - end - - return weight_gradient -end -=# \ No newline at end of file From 28a027a703b649532c641927ca085aa3c76dbca9 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:56:01 +0200 Subject: [PATCH 03/37] Delete ext/NNlibLoopVectorizationExt/pooling_old.jl --- ext/NNlibLoopVectorizationExt/pooling_old.jl | 201 ------------------- 1 file changed, 201 deletions(-) delete mode 100644 ext/NNlibLoopVectorizationExt/pooling_old.jl diff --git a/ext/NNlibLoopVectorizationExt/pooling_old.jl b/ext/NNlibLoopVectorizationExt/pooling_old.jl deleted file mode 100644 index dc8fe34d5..000000000 --- a/ext/NNlibLoopVectorizationExt/pooling_old.jl +++ /dev/null @@ -1,201 +0,0 @@ -#= -Accelerated mean pooling for 2d-images using the power of LoopVectorization. -The speed up is lower compared to conv but can be approximately up to 2x. - -Since the current state of LoopVectorization ∇meanpool! isn't really faster than the -original implementation in some situations, it is left out for the moment. - -Implementation inspired from here (Jonas Steinebach, MIT): -https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl -=# - -function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} - # storing all the necessary shapes - input_width, input_height, channels, current_batch_size = size(input) - output_width, output_height, channels, current_batch_size = size(output) - kernel_width, kernel_height = pdims.kernel_size - - x_stride, y_stride = pdims.stride - x_dilation, y_dilation = pdims.dilation - x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding - - # A helper function to project from output (w, h) to input (input_w, input_h) - @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 - - # We use calc_padding_regions to split outselves up into separate regions that may or - # may not need to worry about padding: - pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) - # println(pdims_3d.padding) - padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) - - # We represent division by kernel size by rolling it - # into the `alpha` multiplier. 
- _alpha = T(1 / prod(pdims.kernel_size)) - - # Start with the central region - w_region, h_region, _ = central_region - - if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - kernel_sum = zero(T) - for y_w in 1:kernel_height, x_w in 1:kernel_width - # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] - kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] - end - output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha - end - end - - else # general case for any meanpooling - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - kernel_sum = zero(T) - for y_w in 1:kernel_height, x_w in 1:kernel_width - y_in = m + (y_w - 1) * y_dilation # - y_pad1 - x_in = n + (x_w - 1) * x_dilation # - x_pad1 - kernel_sum += input[x_in, y_in, channel, index_batch] - end - output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha - end - end - - end - - # Next, the padded regions - @inbounds for (w_region, h_region, d_region) in padded_regions - for index_batch in 1:current_batch_size, channel in 1:channels - for d in d_region # for skipping the d_regions - for h in h_region - ph = project(h, y_stride, y_pad1) - for w in w_region - pw = project(w, x_stride, x_pad1) - m = zero(T) - - for kh in 1:kernel_height - input_kh = ph + (kh - 1) * y_dilation - if input_kh <= 0 || input_kh > input_height - continue - end - - for kw in 1:kernel_width - input_kw = pw + (kw - 1) * x_dilation - if input_kw <= 0 || input_kw > input_width - continue - end - - m += input[input_kw, input_kh, channel, index_batch] - end - end - - output[w, h, channel, index_batch] = _alpha * m - end - end - end - end - end - - return output -end - -function NNlib.∇meanpool!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; kw...) where {T<:Real} - # storing all the necessary shapes - input_width, input_height, channels, current_batch_size = size(input_gradient) - output_width, output_height, channels, current_batch_size = size(output_gradient) - kernel_width, kernel_height = pdims.kernel_size - - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - - x_stride, y_stride = pdims.stride - x_dilation, y_dilation = pdims.dilation - x_pad1, x_pad2, y_pad1, y_pad2 = pdims.padding - - # A helper function to project from output (w, h) to input (input_w, input_h) - @inline project(idx, stride, pad) = (idx - 1) * stride - pad + 1 - - # We use calc_padding_regions to split outselves up into separate regions that may or - # may not need to worry about padding: - pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) - # println(pdims_3d.padding) - padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) - - # We represent division by kernel size by rolling it - # into the `alpha` multiplier. 
- _alpha = T(1 / prod(pdims.kernel_size)) - - # Start with the central region - w_region, h_region, _ = central_region - - if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] - for y_w in 1:kernel_height, x_w in 1:kernel_width - input_gradient[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] += output_gradient_value * _alpha - end - end - end - - else # general case for any meanpooling - - @tturbo for index_batch in 1:current_batch_size - # compute pooling for each channel separatly - for channel in 1:channels, y_out in h_region, x_out in w_region - output_gradient_value = output_gradient[x_out, y_out, channel, index_batch] - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - for y_w in 1:kernel_height, x_w in 1:kernel_width - y_in = m + (y_w - 1) * y_dilation # - y_pad1 - x_in = n + (x_w - 1) * x_dilation # - x_pad1 - input_gradient[x_in, y_in, channel, index_batch] += output_gradient_value * _alpha - end - end - end - - end - - # Next, the padded regions - @inbounds for (w_region, h_region, d_region) in padded_regions - for index_batch in 1:current_batch_size, channel in 1:channels - for d in d_region # for skipping the d_regions - for h in h_region - ph = project(h, y_stride, y_pad1) - for w in w_region - pw = project(w, x_stride, x_pad1) - - # Grab the incoming gradient at this index for future use - output_gradient_value = output_gradient[w, h, channel, index_batch] - - for kh in 1:kernel_height - input_kh = ph + (kh - 1) * y_dilation - if input_kh <= 0 || input_kh > input_height - continue - end - - for kw in 1:kernel_width - input_kw = pw + (kw - 1) * x_dilation - if input_kw <= 0 || input_kw > input_width - continue - end - - input_gradient[input_kw, input_kh, channel, index_batch] += output_gradient_value * _alpha - end - end - - end - end - end - end - end - - return input_gradient -end \ No newline at end of file From 5339aafa8ed56494c50a83a7767e95e7f7be79a7 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:57:34 +0200 Subject: [PATCH 04/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 71 ++++++++++++++++++++++++++ test/runtests.jl | 17 +++++- 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 test/ext_loopvectorization/runtests.jl diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl new file mode 100644 index 000000000..05157a8bc --- /dev/null +++ b/test/ext_loopvectorization/runtests.jl @@ -0,0 +1,71 @@ +using NNlib, Test + +function compute_conv_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}) where {T<:Real} + conv_outs = Vector{Array{T, 4}}(undef, length(settings)) + conv_grads = Vector{Array{T, 4}}(undef, length(settings)) + + for (i, setting) in enumerate(settings) + if setting.groups > 1 + weight = weight_grouped + else + weight = weight_ungrouped + end + + cdims = NNlib.DenseConvDims(size(input), size(weight), stride = setting.stride, padding = setting.padding, dilation = setting.dilation, groups = setting.groups) + + out = NNlib.conv(input, weight, cdims) + output_gradient = ones(T, size(out)) + + 
conv_grads[i] = NNlib.∇conv_data(output_gradient, weight, cdims) + conv_outs[i] = out + end + + return conv_outs, conv_grads +end + +function compute_pool_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}) where {T<:Real} + pool_outs = Vector{Array{T, 4}}(undef, length(settings)) + + for (i, setting) in enumerate(settings) + pdims = NNlib.PoolDims(size(input), setting.kernel_size, stride = setting.stride, padding = setting.padding, dilation = setting.dilation) + pool_outs[i] = NNlib.meanpool(input, pdims) + end + + return pool_outs +end + +@testset "Convolution & Pooling" begin + + dtype = Float32 + input = rand(dtype, 224, 224, 3, 64) # for conv & pool + weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv + weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv + + conv_settings_list = [ + (; stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' + (; stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=1), # test 'second specialized case' + (; stride=(2, 1), padding=(2, 0), dilation=(2, 0), groups=3), # test 'general case' + ] + + pool_settings_list = [ + (; kernel_size=(5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' + (; kernel_size=(5, 4), stride=(2, 1), padding=(2, 0), dilation=(2, 1)), # test 'general case' + ] + + # compute outputs before loading LoopVectorization + + conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + pool_outs_std = compute_pool_outputs(pool_settings_list, input) + + using LoopVectorization # now load the NNlibLoopVectorizationExt + + conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + pool_outs_lv = compute_pool_outputs(pool_settings_list, input) + + # validate conv + @test all(isapprox.(conv_outs_std, conv_outs_lv)) + @test all(isapprox.(conv_grads_std, conv_grads_lv)) + # validate pool + @test all(isapprox.(pool_outs_std, pool_outs_lv)) + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 03602a40d..5266b5f44 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,8 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests -# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests +# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests +ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests const rng = StableRNG(123) include("test_utils.jl") @@ -155,4 +156,18 @@ end else @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them." end + + if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" + @testset "CPU" begin + # Don't import LoopVectorization here! + # Because the LV-impls are simply tested against NNlib's standard CPU impls, + # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! + @testset "LoopVectorization" begin + include("ext_loopvectorization/runtests.jl") + end + end + else + @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." 
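A sanity check that is not part of this patch but could sit next to the comparison above: the std-vs-LV test only means something if the extension really replaced NNlib's CPU paths, and the Julia >= 1.9 extension API can confirm that (the extension module name is the one declared in this PR's Project.toml).

using NNlib, LoopVectorization, Test
# Base.get_extension returns the extension module once both packages are loaded, `nothing` otherwise
ext = Base.get_extension(NNlib, :NNlibLoopVectorizationExt)
@test ext !== nothing   # from here on, NNlib.conv! / ∇conv_data! / meanpool! dispatch to the LV methods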
+ end + end From 9e0dc6d681c545cb3ff30b747aa384bcba29b691 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:57:47 +0200 Subject: [PATCH 05/37] Add files via upload --- Project.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index b33b90564..49ef5d07f 100644 --- a/Project.toml +++ b/Project.toml @@ -18,11 +18,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" [extensions] NNlibAMDGPUExt = "AMDGPU" NNlibCUDAExt = "CUDA" NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] +NNlibLoopVectorizationExt = "LoopVectorization" [compat] AMDGPU = "0.5, 0.6" @@ -35,6 +37,7 @@ GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" julia = "1.9" +# LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" @@ -44,6 +47,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -53,5 +57,5 @@ cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] test = ["AMDGPU", "CUDA", "ChainRulesTestUtils", "Documenter", - "FiniteDifferences", "ForwardDiff", "Logging", "ReverseDiff", - "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] + "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", + "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From dd0f0ed866a6d2ec12b865f4ccd7f0f1670d3db9 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:58:58 +0200 Subject: [PATCH 06/37] Add files via upload --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5266b5f44..2a44d8d3e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,7 +16,7 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests # ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests -ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests +# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests const rng = StableRNG(123) include("test_utils.jl") From b341d1cd81a968fccd5739fa40bf6c85ea330e82 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 23:16:41 +0200 Subject: [PATCH 07/37] Add files via upload --- runtests.jl | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 runtests.jl diff --git a/runtests.jl b/runtests.jl new file mode 100644 index 000000000..5266b5f44 --- /dev/null +++ b/runtests.jl @@ -0,0 +1,173 @@ +using NNlib, Test, Statistics, Random +using ChainRulesCore, ChainRulesTestUtils +using Base.Broadcast: broadcasted +import FiniteDifferences +import ForwardDiff +import Zygote +using Zygote: gradient +using StableRNGs +using Documenter 
+using Adapt +using KernelAbstractions +import ReverseDiff as RD # used in `pooling.jl` + +DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursive=true) + +# ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests +# ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests +# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests +ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests + +const rng = StableRNG(123) +include("test_utils.jl") + +macro conditional_testset(name, skip_tests, expr) + esc(quote + @testset $name begin + if $name ∉ $skip_tests + $expr + else + @test_skip false + end + end + end) +end + +cpu(x) = adapt(CPU(), x) + +include("gather.jl") +include("scatter.jl") +include("upsample.jl") + +function nnlib_testsuite(Backend; skip_tests = Set{String}()) + @conditional_testset "Upsample" skip_tests begin + upsample_testsuite(Backend) + end + @conditional_testset "Gather" skip_tests begin + gather_testsuite(Backend) + end + @conditional_testset "Scatter" skip_tests begin + scatter_testsuite(Backend) + end +end + +@testset verbose=true "NNlib.jl" begin + + if get(ENV, "NNLIB_TEST_CPU", "true") == "true" + @testset "CPU" begin + @testset "Doctests" begin + doctest(NNlib, manual=false) + end + + nnlib_testsuite(CPU) + + @testset "Activation Functions" begin + include("activations.jl") + include("bias_act.jl") + end + + @testset "Attention" begin + include("attention.jl") + end + + @testset "Batched Multiplication" begin + include("batchedmul.jl") + end + + @testset "Convolution" begin + include("conv.jl") + include("conv_bias_act.jl") + end + + @testset "CTC Loss" begin + include("ctc.jl") + end + + @testset "Dropout" begin + include("dropout.jl") + end + + @testset "Fold/Unfold" begin + include("fold.jl") + end + + @testset "Inference" begin + include("inference.jl") + end + + @testset "Pooling" begin + include("pooling.jl") + end + + @testset "Padding" begin + include("padding.jl") + end + + @testset "Softmax" begin + include("softmax.jl") + end + + @testset "Utilities" begin + include("utils.jl") + end + + @testset "Grid Sampling" begin + include("sampling.jl") + end + + @testset "Functions" begin + include("functions.jl") + end + end + else + @info "Skipping CPU tests, set NNLIB_TEST_CPU=true to run them." + end + + if get(ENV, "NNLIB_TEST_CUDA", "false") == "true" + using CUDA + if CUDA.functional() + @testset "CUDA" begin + nnlib_testsuite(CUDABackend; skip_tests=Set(("Scatter", "Gather"))) + + include("ext_cuda/runtests.jl") + end + else + @info "Insufficient version or CUDA not found; Skipping CUDA tests" + end + else + @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them" + end + + if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true" + using AMDGPU + AMDGPU.versioninfo() + if AMDGPU.functional() && AMDGPU.functional(:MIOpen) + @show AMDGPU.MIOpen.version() + @testset "AMDGPU" begin + nnlib_testsuite(ROCBackend) + AMDGPU.synchronize(; blocking=false) + + include("ext_amdgpu/runtests.jl") + AMDGPU.synchronize(; blocking=false) + end + else + @info "AMDGPU.jl package is not functional. Skipping AMDGPU tests." + end + else + @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them." + end + + if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" + @testset "CPU" begin + # Don't import LoopVectorization here! 
+ # Because the LV-impls are simply tested against NNlib's standard CPU impls, + # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! + @testset "LoopVectorization" begin + include("ext_loopvectorization/runtests.jl") + end + end + else + @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." + end + +end From 94f79642a2271ecb6c83a544e13a7aeb9245468c Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 26 Sep 2023 23:26:30 +0200 Subject: [PATCH 08/37] Update runtests.jl --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 2a44d8d3e..5266b5f44 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,7 +16,7 @@ DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursiv # ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests # ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests # ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests -# ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests +ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests const rng = StableRNG(123) include("test_utils.jl") From 6cc2e7516a5ea2b1046ba9b3a9f9bd4de8e6306c Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:22:04 +0200 Subject: [PATCH 09/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 39 ++++++++++++++------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 05157a8bc..9c4d82b75 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -1,34 +1,32 @@ -using NNlib, Test +using NNlib, Test, BenchmarkTools -function compute_conv_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}) where {T<:Real} +function compute_conv_outputs(settings::Vector{<:NNlib.ConvDims}, input::Array{T,4}, weight_ungrouped::Array{T,4}, weight_grouped::Array{T,4}, conv_output_grads::Vector{Array{T,4}}) where {T<:Real} conv_outs = Vector{Array{T, 4}}(undef, length(settings)) conv_grads = Vector{Array{T, 4}}(undef, length(settings)) for (i, setting) in enumerate(settings) - if setting.groups > 1 + if setting.groupcount > 1 weight = weight_grouped else weight = weight_ungrouped end - cdims = NNlib.DenseConvDims(size(input), size(weight), stride = setting.stride, padding = setting.padding, dilation = setting.dilation, groups = setting.groups) + out = @btime NNlib.conv($input, $weight, $setting) + output_gradient = conv_output_grads[i] - out = NNlib.conv(input, weight, cdims) - output_gradient = ones(T, size(out)) - - conv_grads[i] = NNlib.∇conv_data(output_gradient, weight, cdims) + conv_grads[i] = @btime NNlib.∇conv_data($output_gradient, $weight, $setting) conv_outs[i] = out end return conv_outs, conv_grads end -function compute_pool_outputs(settings::Vector{<:NamedTuple}, input::Array{T,4}) where {T<:Real} +function compute_pool_outputs(settings::Vector{<:NNlib.PoolDims}, input::Array{T,4}) where {T<:Real} pool_outs = Vector{Array{T, 4}}(undef, length(settings)) for (i, setting) in enumerate(settings) pdims = NNlib.PoolDims(size(input), setting.kernel_size, stride = setting.stride, padding = setting.padding, dilation = setting.dilation) - pool_outs[i] = NNlib.meanpool(input, 
pdims) + pool_outs[i] = @btime NNlib.meanpool($input, $pdims) end return pool_outs @@ -37,29 +35,34 @@ end @testset "Convolution & Pooling" begin dtype = Float32 - input = rand(dtype, 224, 224, 3, 64) # for conv & pool + batch_size = 64 + input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv conv_settings_list = [ - (; stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' - (; stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=1), # test 'second specialized case' - (; stride=(2, 1), padding=(2, 0), dilation=(2, 0), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'second specialized case' + NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' ] + conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, batch_size) for setting in conv_settings_list] + pool_settings_list = [ - (; kernel_size=(5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' - (; kernel_size=(5, 4), stride=(2, 1), padding=(2, 0), dilation=(2, 1)), # test 'general case' + NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' + NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] # compute outputs before loading LoopVectorization - conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + println("without LoopVectorization") + conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) using LoopVectorization # now load the NNlibLoopVectorizationExt - conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped) + println("with LoopVectorization") + conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) # validate conv From c5c79ee71f6c7c9b58c3a339c219557ce3ee3c90 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:28:37 +0200 Subject: [PATCH 10/37] Add files via upload --- Project.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 49ef5d07f..e1794c43f 100644 --- a/Project.toml +++ b/Project.toml @@ -41,6 +41,7 @@ julia = "1.9" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" @@ -56,6 +57,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["AMDGPU", "CUDA", "ChainRulesTestUtils", "Documenter", +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "Documenter", "FiniteDifferences", 
"ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From 132e35cc5608f7de900ba42a2112bdfe74a3fe44 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:59:01 +0200 Subject: [PATCH 11/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 9c4d82b75..f94b22acd 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -34,7 +34,7 @@ end @testset "Convolution & Pooling" begin - dtype = Float32 + dtype = Float64 # Float32 batch_size = 64 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv @@ -67,8 +67,22 @@ end # validate conv @test all(isapprox.(conv_outs_std, conv_outs_lv)) - @test all(isapprox.(conv_grads_std, conv_grads_lv)) + # @test all(isapprox.(conv_grads_std, conv_grads_lv)) # seems to be wrong on some CI devices, reason unknown # validate pool @test all(isapprox.(pool_outs_std, pool_outs_lv)) + @info isapprox(conv_grads_std[1], conv_grads_lv[1]) + @info isapprox(conv_grads_std[2], conv_grads_lv[2]) + @info isapprox(conv_grads_std[3], conv_grads_lv[3]) + + @testset "Conv impl 1" begin + @test isapprox(conv_grads_std[1], conv_grads_lv[1]) + end + @testset "Conv impl 2" begin + @test isapprox(conv_grads_std[2], conv_grads_lv[2]) + end + @testset "Conv impl 3" begin + @test isapprox(conv_grads_std[3], conv_grads_lv[3]) + end + end \ No newline at end of file From aa019e9c4d9dfce95d62a68748096a0e54e02d74 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 20:26:23 +0200 Subject: [PATCH 12/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index f94b22acd..12bb84204 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -72,8 +72,13 @@ end @test all(isapprox.(pool_outs_std, pool_outs_lv)) @info isapprox(conv_grads_std[1], conv_grads_lv[1]) + println(sum(conv_grads_std[1])); println(sum(conv_grads_lv[1])) + @info isapprox(conv_grads_std[2], conv_grads_lv[2]) + println(sum(conv_grads_std[2])); println(sum(conv_grads_lv[2])) + @info isapprox(conv_grads_std[3], conv_grads_lv[3]) + println(sum(conv_grads_std[3])); println(sum(conv_grads_lv[3])) @testset "Conv impl 1" begin @test isapprox(conv_grads_std[1], conv_grads_lv[1]) From 52e2a78a9a443c7aaebfc4e771baf730a95c5367 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Wed, 27 Sep 2023 21:07:09 +0200 Subject: [PATCH 13/37] Add files via upload --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index e1794c43f..68ea1f946 100644 --- a/Project.toml +++ b/Project.toml @@ -37,7 +37,7 @@ GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" julia = "1.9" -# LoopVectorization = "=0.12.146" +LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" From d63f8a53f82d5ba10ac3450feadb38d37121de6e Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 20:05:25 +0200 Subject: [PATCH 14/37] Add files via upload 
--- test/ext_loopvectorization/minimal_test.jl | 81 ++++++++++++++++++++++ test/ext_loopvectorization/runtests.jl | 8 ++- 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 test/ext_loopvectorization/minimal_test.jl diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl new file mode 100644 index 000000000..a40ff985c --- /dev/null +++ b/test/ext_loopvectorization/minimal_test.jl @@ -0,0 +1,81 @@ +using NNlib, LoopVectorization + +function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + input_width, input_height, in_channels, current_batch_size = size(input_gradient) + + if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1) + throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1")) + end + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + return input_gradient +end + +function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + NNlib.check_dims(size(input_gradient), size(weight), size(output_gradient), cdims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + input_width, input_height, in_channels, current_batch_size = size(input_gradient) + + if cdims.padding != (0, 0, 0, 0) || cdims.groupcount != 1 || cdims.stride != (1, 1) || cdims.dilation != (1, 1) + throw(ArgumentError("this test function only supports basic conv (or crosscor) bwd with pad=0, stride=1, dilation=1, groups=1")) + end + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + for index_batch in 1:current_batch_size # NO @threads here + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # NO @turbo here! 
+ for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + + return input_gradient +end + +dtype = Float32 # Float64 +batch_size = 5 +input = rand(dtype, 50, 50, 3, batch_size) +weight = rand(dtype, 5, 5, 3, 9) +cdims = NNlib.DenseConvDims(size(input), size(weight), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1) +output_gradient = rand(dtype, NNlib.output_size(cdims)..., 9, batch_size) + +input_gradient_noavx = zeros(dtype, size(input)...) +input_gradient_noavx = ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims) +input_gradient_noavx = @time ∇conv_data!_noavx(input_gradient_noavx, output_gradient, weight, cdims) + +input_gradient_avx = zeros(dtype, size(input)...) +input_gradient_avx = ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims) +input_gradient_avx = @time ∇conv_data!_avx(input_gradient_avx, output_gradient, weight, cdims) + +@info isapprox(input_gradient_noavx, input_gradient_avx) +@testset "conv bwd minimal" begin + @test isapprox(input_gradient_noavx, input_gradient_avx) +end +@show sum(input_gradient_noavx) +@show sum(input_gradient_avx) \ No newline at end of file diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 12bb84204..c715bb2f0 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -35,7 +35,7 @@ end @testset "Convolution & Pooling" begin dtype = Float64 # Float32 - batch_size = 64 + batch_size = 32 # 64 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv @@ -55,12 +55,15 @@ end # compute outputs before loading LoopVectorization + #= println("without LoopVectorization") conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) + =# using LoopVectorization # now load the NNlibLoopVectorizationExt + #= println("with LoopVectorization") conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) @@ -89,5 +92,8 @@ end @testset "Conv impl 3" begin @test isapprox(conv_grads_std[3], conv_grads_lv[3]) end + =# + + include("minimal_test.jl") end \ No newline at end of file From ae86d13c6523282398358cb484ccfabcd721152d Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 20:49:17 +0200 Subject: [PATCH 15/37] Add files via upload --- test/ext_loopvectorization/minimal_test.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index a40ff985c..e3750b862 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -18,7 +18,10 @@ function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T, weight = reverse(weight, dims=(1, 2)) end - Threads.@threads for index_batch in 1:current_batch_size + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + 
input_gradient .= zero(T) + + for index_batch in 1:current_batch_size # Threads.@threads @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width input_gradient[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] @@ -47,6 +50,9 @@ function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{ weight = reverse(weight, dims=(1, 2)) end + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + for index_batch in 1:current_batch_size # NO @threads here for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width # NO @turbo here! for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width From 776835d6e64a091bf6b06b0170912d4992096d21 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 20:59:19 +0200 Subject: [PATCH 16/37] Add files via upload --- test/runtests.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5266b5f44..c78c74bcd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -158,13 +158,11 @@ end end if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" - @testset "CPU" begin + @testset "LoopVectorization" begin # Don't import LoopVectorization here! # Because the LV-impls are simply tested against NNlib's standard CPU impls, # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! - @testset "LoopVectorization" begin - include("ext_loopvectorization/runtests.jl") - end + include("ext_loopvectorization/runtests.jl") end else @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." 
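Two illustrative aids for the gradient-mismatch hunt above; both are sketches, not part of the patches. First, `@turbo` reassociates the floating-point accumulation, so even a correct vectorized kernel need not match the scalar loop bit for bit, which is why the comparisons rely on `isapprox` rather than `==` (this only accounts for roundoff-sized deviations, not the larger CI differences being chased here):

xs = Float32[1f0, 1f-8, -1f0, 1f-8]
s_ltr = foldl(+, xs)                        # left-to-right order: the first 1f-8 is absorbed by 1f0, result 1f-8
s_re  = (xs[1] + xs[3]) + (xs[2] + xs[4])   # a reassociated order: result 2f-8
s_ltr == s_re                               # false
isapprox(s_ltr, s_re; atol=1f-7)            # true

Second, the `sum(...)` printouts hide where two arrays actually disagree; a small helper that reports the largest elementwise deviation and its position makes it easier to tell reordering noise from an indexing bug (the helper name and usage are illustrative only):

function report_mismatch(a::AbstractArray, b::AbstractArray)
    d = abs.(a .- b)
    m, idx = findmax(d)                     # largest absolute difference and its CartesianIndex
    println("max abs diff = ", m, " at ", Tuple(idx), ", isapprox: ", isapprox(a, b))
    return m
end
# e.g. report_mismatch(conv_grads_std[1], conv_grads_lv[1])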
From 13205da155cceb8f5f708041897dbf3543f57ce8 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:11:15 +0200 Subject: [PATCH 17/37] Add files via upload --- Project.toml | 5 +++-- test/ext_loopvectorization/minimal_test.jl | 4 +++- test/ext_loopvectorization/runtests.jl | 10 ++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Project.toml b/Project.toml index 68ea1f946..29e948b6f 100644 --- a/Project.toml +++ b/Project.toml @@ -37,13 +37,14 @@ GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" julia = "1.9" -LoopVectorization = "=0.12.146" +# LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" +CpuId = "adafc99b-e345-5852-983c-f28acb93d879" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" @@ -57,6 +58,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "Documenter", +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index e3750b862..c081a8fb1 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -1,4 +1,4 @@ -using NNlib, LoopVectorization +using NNlib, LoopVectorization, CpuId function ∇conv_data!_avx(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} @@ -64,6 +64,8 @@ function ∇conv_data!_noavx(input_gradient::Array{T,4}, output_gradient::Array{ return input_gradient end +println(cpuinfo()) + dtype = Float32 # Float64 batch_size = 5 input = rand(dtype, 50, 50, 3, batch_size) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index c715bb2f0..932a04a57 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -34,8 +34,8 @@ end @testset "Convolution & Pooling" begin - dtype = Float64 # Float32 - batch_size = 32 # 64 + dtype = Float32 # Float64 + batch_size = 64 # 32 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv @@ -53,17 +53,15 @@ end NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] - # compute outputs before loading LoopVectorization - #= + # compute outputs before loading LoopVectorization + println("without LoopVectorization") conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) - =# using LoopVectorization # now load the NNlibLoopVectorizationExt - #= println("with LoopVectorization") conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) From 5850341872f5dc2670b5648bf2a4556f55ddb87b Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:24:09 +0200 Subject: [PATCH 18/37] Add files via upload --- test/ext_loopvectorization/minimal_test.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index c081a8fb1..b401efe93 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -66,7 +66,7 @@ end println(cpuinfo()) -dtype = Float32 # Float64 +dtype = Float64 # Float32 batch_size = 5 input = rand(dtype, 50, 50, 3, batch_size) weight = rand(dtype, 5, 5, 3, 9) From 00b28f2ca570553148dd846cb6b5bf4f69545243 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:34:02 +0200 Subject: [PATCH 19/37] Add files via upload --- test/ext_loopvectorization/minimal_test.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ext_loopvectorization/minimal_test.jl b/test/ext_loopvectorization/minimal_test.jl index b401efe93..c081a8fb1 100644 --- a/test/ext_loopvectorization/minimal_test.jl +++ b/test/ext_loopvectorization/minimal_test.jl @@ -66,7 +66,7 @@ end println(cpuinfo()) -dtype = Float64 # Float32 +dtype = Float32 # Float64 batch_size = 5 input = rand(dtype, 50, 50, 3, batch_size) weight = rand(dtype, 5, 5, 3, 9) From af04cc66bd91a5b71418b9f7afacf6a53128459a Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:34:36 +0200 Subject: [PATCH 20/37] Delete runtests.jl --- runtests.jl | 173 ---------------------------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 runtests.jl diff --git a/runtests.jl b/runtests.jl deleted file mode 100644 index 5266b5f44..000000000 --- a/runtests.jl +++ /dev/null @@ -1,173 +0,0 @@ -using NNlib, 
Test, Statistics, Random -using ChainRulesCore, ChainRulesTestUtils -using Base.Broadcast: broadcasted -import FiniteDifferences -import ForwardDiff -import Zygote -using Zygote: gradient -using StableRNGs -using Documenter -using Adapt -using KernelAbstractions -import ReverseDiff as RD # used in `pooling.jl` - -DocMeta.setdocmeta!(NNlib, :DocTestSetup, :(using NNlib, UnicodePlots); recursive=true) - -# ENV["NNLIB_TEST_CUDA"] = "true" # uncomment to run CUDA tests -# ENV["NNLIB_TEST_AMDGPU"] = "true" # uncomment to run AMDGPU tests -# ENV["NNLIB_TEST_LOOPVECTORIZATION"] = "false" # uncomment to skip LoopVectorization tests -ENV["NNLIB_TEST_CPU"] = "false" # uncomment to skip CPU tests - -const rng = StableRNG(123) -include("test_utils.jl") - -macro conditional_testset(name, skip_tests, expr) - esc(quote - @testset $name begin - if $name ∉ $skip_tests - $expr - else - @test_skip false - end - end - end) -end - -cpu(x) = adapt(CPU(), x) - -include("gather.jl") -include("scatter.jl") -include("upsample.jl") - -function nnlib_testsuite(Backend; skip_tests = Set{String}()) - @conditional_testset "Upsample" skip_tests begin - upsample_testsuite(Backend) - end - @conditional_testset "Gather" skip_tests begin - gather_testsuite(Backend) - end - @conditional_testset "Scatter" skip_tests begin - scatter_testsuite(Backend) - end -end - -@testset verbose=true "NNlib.jl" begin - - if get(ENV, "NNLIB_TEST_CPU", "true") == "true" - @testset "CPU" begin - @testset "Doctests" begin - doctest(NNlib, manual=false) - end - - nnlib_testsuite(CPU) - - @testset "Activation Functions" begin - include("activations.jl") - include("bias_act.jl") - end - - @testset "Attention" begin - include("attention.jl") - end - - @testset "Batched Multiplication" begin - include("batchedmul.jl") - end - - @testset "Convolution" begin - include("conv.jl") - include("conv_bias_act.jl") - end - - @testset "CTC Loss" begin - include("ctc.jl") - end - - @testset "Dropout" begin - include("dropout.jl") - end - - @testset "Fold/Unfold" begin - include("fold.jl") - end - - @testset "Inference" begin - include("inference.jl") - end - - @testset "Pooling" begin - include("pooling.jl") - end - - @testset "Padding" begin - include("padding.jl") - end - - @testset "Softmax" begin - include("softmax.jl") - end - - @testset "Utilities" begin - include("utils.jl") - end - - @testset "Grid Sampling" begin - include("sampling.jl") - end - - @testset "Functions" begin - include("functions.jl") - end - end - else - @info "Skipping CPU tests, set NNLIB_TEST_CPU=true to run them." - end - - if get(ENV, "NNLIB_TEST_CUDA", "false") == "true" - using CUDA - if CUDA.functional() - @testset "CUDA" begin - nnlib_testsuite(CUDABackend; skip_tests=Set(("Scatter", "Gather"))) - - include("ext_cuda/runtests.jl") - end - else - @info "Insufficient version or CUDA not found; Skipping CUDA tests" - end - else - @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them" - end - - if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true" - using AMDGPU - AMDGPU.versioninfo() - if AMDGPU.functional() && AMDGPU.functional(:MIOpen) - @show AMDGPU.MIOpen.version() - @testset "AMDGPU" begin - nnlib_testsuite(ROCBackend) - AMDGPU.synchronize(; blocking=false) - - include("ext_amdgpu/runtests.jl") - AMDGPU.synchronize(; blocking=false) - end - else - @info "AMDGPU.jl package is not functional. Skipping AMDGPU tests." - end - else - @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them." 
- end - - if get(ENV, "NNLIB_TEST_LOOPVECTORIZATION", "true") == "true" - @testset "CPU" begin - # Don't import LoopVectorization here! - # Because the LV-impls are simply tested against NNlib's standard CPU impls, - # importing LoopVectorization here would load NNlibLoopVectorizationExt too early! - @testset "LoopVectorization" begin - include("ext_loopvectorization/runtests.jl") - end - end - else - @info "Skipping LoopVectorization tests, set NNLIB_TEST_LOOPVECTORIZATION=true to run them." - end - -end From 990a34c189d16bed8a6d0c52e5c6b2fc365d6f8e Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:37:05 +0200 Subject: [PATCH 21/37] Delete Project.toml --- Project.toml | 63 ---------------------------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 Project.toml diff --git a/Project.toml b/Project.toml deleted file mode 100644 index 29e948b6f..000000000 --- a/Project.toml +++ /dev/null @@ -1,63 +0,0 @@ -name = "NNlib" -uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.9.6" - -[deps] -Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" -ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Requires = "ae029012-a4dd-5104-9daa-d747884805df" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[weakdeps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" - -[extensions] -NNlibAMDGPUExt = "AMDGPU" -NNlibCUDAExt = "CUDA" -NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] -NNlibLoopVectorizationExt = "LoopVectorization" - -[compat] -AMDGPU = "0.5, 0.6" -Adapt = "3.2" -Atomix = "0.1" -ChainRulesCore = "1.13" -CUDA = "4, 5" -cuDNN = "1" -GPUArraysCore = "0.1" -KernelAbstractions = "0.9.2" -Requires = "1.0" -julia = "1.9" -# LoopVectorization = "=0.12.146" - -[extras] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" -CpuId = "adafc99b-e345-5852-983c-f28acb93d879" -Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" -ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" -ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" -StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" -Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" - -[targets] -test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", - "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", - "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From db0ad6629e02828b618f5a00a378387d489a335f Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Thu, 28 Sep 2023 21:39:46 +0200 Subject: [PATCH 22/37] Add files via upload --- 
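For readers following the Project.toml churn in these patches: with the [weakdeps] and [extensions] entries the re-added file below declares, no further opt-in is needed, since Julia >= 1.9 loads the extension automatically once both packages are present in the same session. A minimal sketch of what that looks like from the user side:

using NNlib               # LoopVectorization methods are not active yet
using LoopVectorization   # NNlibLoopVectorizationExt now loads and overrides NNlib.conv!, NNlib.∇conv_data! and NNlib.meanpool!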
Project.toml | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 Project.toml diff --git a/Project.toml b/Project.toml new file mode 100644 index 000000000..29e948b6f --- /dev/null +++ b/Project.toml @@ -0,0 +1,63 @@ +name = "NNlib" +uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +version = "0.9.6" + +[deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" + +[extensions] +NNlibAMDGPUExt = "AMDGPU" +NNlibCUDAExt = "CUDA" +NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] +NNlibLoopVectorizationExt = "LoopVectorization" + +[compat] +AMDGPU = "0.5, 0.6" +Adapt = "3.2" +Atomix = "0.1" +ChainRulesCore = "1.13" +CUDA = "4, 5" +cuDNN = "1" +GPUArraysCore = "0.1" +KernelAbstractions = "0.9.2" +Requires = "1.0" +julia = "1.9" +# LoopVectorization = "=0.12.146" + +[extras] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" +CpuId = "adafc99b-e345-5852-983c-f28acb93d879" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" + +[targets] +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", + "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", + "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] From a4e18e6b8311afc88c9587b55954884d8cef8a42 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Fri, 29 Sep 2023 14:23:05 +0200 Subject: [PATCH 23/37] Add files via upload --- Project.toml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 29e948b6f..d267c698f 100644 --- a/Project.toml +++ b/Project.toml @@ -16,26 +16,29 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" [extensions] NNlibAMDGPUExt = "AMDGPU" -NNlibCUDAExt = "CUDA" NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] +NNlibCUDAExt = "CUDA" +NNlibEnzymeCoreExt = 
"EnzymeCore" NNlibLoopVectorizationExt = "LoopVectorization" [compat] AMDGPU = "0.5, 0.6" Adapt = "3.2" Atomix = "0.1" -ChainRulesCore = "1.13" CUDA = "4, 5" -cuDNN = "1" +ChainRulesCore = "1.13" +EnzymeCore = "0.5, 0.6" GPUArraysCore = "0.1" KernelAbstractions = "0.9.2" Requires = "1.0" +cuDNN = "1" julia = "1.9" # LoopVectorization = "=0.12.146" @@ -46,6 +49,9 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" CpuId = "adafc99b-e345-5852-983c-f28acb93d879" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +EnzymeTestUtils = "12d8515a-0907-448a-8884-5fe00fdf1c5a" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -58,6 +64,5 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", - "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", - "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN"] +test = ["AMDGPU", "BenchmarkTools", "CUDA", "ChainRulesTestUtils", "CpuId", "Documenter", "FiniteDifferences", "ForwardDiff", "Logging", "LoopVectorization", "ReverseDiff", "StableRNGs", "Test", "UnicodePlots", "Zygote", "cuDNN", + "Enzyme", "EnzymeCore", "EnzymeTestUtils"] From f58437714575bcd710ec3dc78ba0acd5fc3f03ef Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:10:40 +0200 Subject: [PATCH 24/37] Add files via upload --- .../NNlibLoopVectorizationExt.jl | 5 ++ ext/NNlibLoopVectorizationExt/conv.jl | 76 +++++++++++++++++-- ext/NNlibLoopVectorizationExt/pooling.jl | 2 +- 3 files changed, 76 insertions(+), 7 deletions(-) diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl index d2053161c..3be6c28a2 100644 --- a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl +++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl @@ -3,6 +3,11 @@ module NNlibLoopVectorizationExt using NNlib using LoopVectorization using Random, Statistics +using OffsetArrays, Static + +# Bevor die Methoden überladen werden, sollte ein Selfcheck durchgeführt werden, ob die Ergebnisse mit NNlib übereinstimmen. +# Wenn nicht, sollte eine Warnung angezeigt werden und die wenn irgendwie möglich, nur die funktionierenden Methoden verwendet werden. +# Z.b. in dem bei falschem Ergebniss, die im2col Methode als Backend in der überladenen Methode aufgerufen wird. include("conv.jl") include("pooling.jl") diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index 3aafe9246..dd59ecb0c 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -6,8 +6,11 @@ Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the original implementation in some situations, it is left out for the moment. 
-Implementation copied from here (Jonas Steinebach, MIT): +Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license): https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl + +Implementation for backward pass mostly copied from here (Chris Elrod, MIT license): +https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl =# function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} @@ -123,21 +126,44 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ x_dilation, y_dilation = cdims.dilation out_channels_per_group = out_channels ÷ groups + @inline static_size(x::AbstractArray{T, N}) where {T, N} = static.(size(x)) + # actual computation (using @tturbo instead of Threads.@threads + @turbo may end up in wrong results) if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance # println("backward: very specialized case for maximum performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + output_gradient = OffsetArray(output_gradient, OffsetArrays.Origin(0, 0, 0, 0)) + input_gradient_padded = OffsetArray(input_gradient_padded, OffsetArrays.Origin(0, 0, 0, 0)) + weight = OffsetArray(weight, OffsetArrays.Origin(0, 0, 0, 0)) + + input_width, input_height, in_channels, batch_size = static_size(input_gradient_padded) + weight_width, weight_height, in_channels_weight, out_channels = static_size(weight) + + y_upper_bound = static(output_height) # input_width - weight_width + static(1) + x_upper_bound = static(output_width) # input_height - weight_height + static(1) + + @tturbo for index_batch in 0:batch_size-1 + for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 # @tturbo unroll = (2, 1) + + value = zero(T) + for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 + ib0 = (x_in - x_w >= 0) & (x_in - x_w < x_upper_bound) + ib1 = (y_in - y_w >= 0) & (y_in - y_w < y_upper_bound) + output_gradient_value = (ib0 & ib1) ? output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] : zero(T) + value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value + # value += (ib0 & ib1) ? 
output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) end + input_gradient_padded[x_in, y_in, in_channel, index_batch] = value + end end - elseif groups == 1 # second specialized case for better performance + input_gradient_padded = input_gradient_padded.parent + + elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance # println("backward: second specialized case for better performance") + #= Threads.@threads for index_batch in 1:current_batch_size @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width m = y_out + (y_stride - 1) * (y_out - 1) @@ -149,6 +175,44 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ end end end + =# + + y_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_height) + x_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_width) + x_out_indices .= -1 + y_out_indices .= -1 + + @turbo for y_out in 1:output_height, x_out in 1:output_width # + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + y_out_indices[x_w, y_w, y_in] = y_out + x_out_indices[x_w, y_w, x_in] = x_out + end + end + + @tturbo for index_batch in 1:current_batch_size + for x_in in 1:input_width, y_in in 1:input_height, in_channel in 1:in_channels # @tturbo unroll = (2, 1) + + value = zero(T) + for x_w in 1:weight_width, y_w in 1:weight_height, out_channel in 1:out_channels + + x_out = x_out_indices[x_w, y_w, x_in] + y_out = y_out_indices[x_w, y_w, y_in] + + ib0 = x_out > -1 # != + ib1 = y_out > -1 # != + + output_gradient_value = (ib0 & ib1) ? output_gradient[x_out, y_out, out_channel, index_batch] : zero(T) + # output_gradient_value = T(2.0) # output_gradient[x_out, y_out, out_channel, index_batch] + value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value + end + input_gradient[x_in, y_in, in_channel, index_batch] = value + + end + end else # general case for any convolution # println("backward: general case for any convolution") diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl index dc948335b..420bb7413 100644 --- a/ext/NNlibLoopVectorizationExt/pooling.jl +++ b/ext/NNlibLoopVectorizationExt/pooling.jl @@ -5,7 +5,7 @@ The speed up is usually lower compared to conv but can be approximately up to 2x Since the current state of LoopVectorization ∇meanpool! isn't really faster than the original implementation in some situations, it is left out for the moment. 
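A plain scalar reference for the masked-gather backward pass added to conv.jl above, useful for checking the index algebra by eye. It is an illustrative sketch only: stride = dilation = 1, no padding, kernel already flipped, and ordinary 1-based indexing instead of the origin-0 OffsetArrays of the patch. (For the strided cases, note that `y_out + (y_stride - 1) * (y_out - 1)` is simply `(y_out - 1) * y_stride + 1`, the first input row an output row touches.)

function ∇conv_data_ref!(dx::Array{T,4}, dy::Array{T,4}, w::Array{T,4}) where {T<:Real}
    W, H, C, B = size(dx)               # input gradient: width, height, channels, batch
    KW, KH, _, O = size(w)              # kernel extents and output channels
    OW, OH = W - KW + 1, H - KH + 1     # output extents for stride 1, no padding
    for b in 1:B, c in 1:C, yi in 1:H, xi in 1:W
        v = zero(T)
        for o in 1:O, kh in 1:KH, kw in 1:KW
            xo = xi - kw + 1            # the output column that read this input pixel with this kernel column
            yo = yi - kh + 1
            if 1 <= xo <= OW && 1 <= yo <= OH
                v += w[kw, kh, c, o] * dy[xo, yo, o, b]
            end
        end
        dx[xi, yi, c, b] = v            # gather formulation: each entry written once, no overlapping `+=`
    end
    return dx
end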
-Implementation inspired from here (Jonas Steinebach, MIT): +Implementation inspired from here (Jonas Steinebach, MIT license): https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_pooling.jl =# From 274db102b4a2e5643df96e1ed0415719b30e212a Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:10:53 +0200 Subject: [PATCH 25/37] Add files via upload --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d267c698f..fe9314f97 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "NNlib" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.9.6" +version = "0.9.7" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -9,9 +9,11 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Requires = "ae029012-a4dd-5104-9daa-d747884805df" +Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] From 6c33d5cb94505acde722b64d2a49adb3571ad666 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:11:32 +0200 Subject: [PATCH 26/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 932a04a57..f9b5bf867 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -35,7 +35,7 @@ end @testset "Convolution & Pooling" begin dtype = Float32 # Float64 - batch_size = 64 # 32 + batch_size = 64 # 1 # 64 # 32 input = rand(dtype, 224, 224, 3, batch_size) # for conv & pool weight_ungrouped = rand(dtype, 5, 5, 3, 27) # for conv weight_grouped = rand(dtype, 5, 5, 1, 27) # for grouped conv @@ -53,7 +53,6 @@ end NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] - #= # compute outputs before loading LoopVectorization println("without LoopVectorization") @@ -90,8 +89,5 @@ end @testset "Conv impl 3" begin @test isapprox(conv_grads_std[3], conv_grads_lv[3]) end - =# - - include("minimal_test.jl") end \ No newline at end of file From 7affd464c174413d8d24ff05f5ef7b30041a0862 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:42:48 +0200 Subject: [PATCH 27/37] Add files via upload --- ext/NNlibLoopVectorizationExt/conv.jl | 455 ++++++++++++++++++-------- 1 file changed, 327 insertions(+), 128 deletions(-) diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index dd59ecb0c..e0876db0f 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -1,18 +1,273 @@ -#= -Accelerated convolution for 2d-images using the power of LoopVectorization. -The acceleration is usually greatest when the inputs have a large spatial size and few channels. -Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. 
+# # pad naivly, static iters +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + # padding is done naively at the moment + if cdims.padding != (0, 0, 0, 0) + input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2)) + end + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batch_size = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("forward: very specialized case for maximum performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance + # println("forward: second specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + static((y_stride - 1)) * (y_out - 1) + n = x_out + static((x_stride - 1)) * (x_out - 1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) + # x_in = n + (x_w - 1) + value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 # third specialized case for better performance + # println("forward: third specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + static((y_stride - 1)) * (y_out - 1) + n = x_out + static((x_stride - 1)) * (x_out - 1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else # 
general case for any convolution + # println("forward: general case for any convolution") + + @tturbo for index_batch in 1:batch_size + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width + m = y_out + static((y_stride - 1)) * (y_out - 1) + n = x_out + static((x_stride - 1)) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + + return output +end + +#= # pad bounds check +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} + + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) + + output_width, output_height, _ = size(output) + input_width, input_height, in_channels, batch_size = size(input) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + out_channels_per_group = out_channels ÷ groups + + # We use calc_padding_regions to split outselves up into separate regions that may or + # may not need to worry about padding: + cdims_3d = DenseConvDims((input_width, input_height, 1, in_channels, batch_size), (weight_width, weight_height, 1, in_channels_weight, out_channels), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + # println(pdims_3d.padding) + padded_regions, central_region = NNlib.calc_padding_regions(cdims_3d) + + # Start with the central region + w_region, h_region, _ = central_region + + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("forward: very specialized case for maximum performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + value += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance + # 
println("forward: second specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) + # x_in = n + (x_w - 1) + value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + elseif groups == 1 # third specialized case for better performance + # println("forward: third specialized case for better performance") + + @tturbo for index_batch in 1:batch_size + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else # general case for any convolution + # println("forward: general case for any convolution") + + @tturbo for index_batch in 1:batch_size + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # y_in = m + (y_w - 1) * y_dilation + # x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + + # @show w_region # 1:0 warning: when padding is unequal and one of x_pad1 or y_pad1 is 0, emty collections are possible + # @show h_region # 1:0 if isempty(1:0) -> true + # println() + + if cdims.padding != (0, 0, 0, 0) + # Next, the padded regions + for (w_region, h_region, d_region) in padded_regions # @inbounds + for z_out in d_region # for skipping the d_regions + + if cdims.groupcount == 1 + + @tturbo for index_batch in 1:batch_size # @turbo + for out_channel in 1:out_channels, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + value = zero(T) + for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation -Since the current state of LoopVectorization ∇conv_filter! 
isn't really faster than the -original implementation in some situations, it is left out for the moment. + is_in_bound_x = (x_in >= 1) & (x_in <= input_width) + is_in_bound_y = (y_in >= 1) & (y_in <= input_height) -Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license): -https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl + input_value = (is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel, index_batch] : zero(T) + value += input_value * weight[x_w, y_w, in_channel, out_channel] -Implementation for backward pass mostly copied from here (Chris Elrod, MIT license): -https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl + # value += (ib0 & ib1) ? input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) + + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + else + + @tturbo for index_batch in 1:batch_size # @turbo + for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region + m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + value = zero(T) + for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + + is_in_bound_x = (x_in >= 1) & (x_in <= input_width) + is_in_bound_y = (y_in >= 1) & (y_in <= input_height) + + input_value = (is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel_input, index_batch] : zero(T) + value += input_value * weight[x_w, y_w, in_channel_weight, out_channel] + + # value += (ib0 & ib1) ? input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] : zero(T) + + end + output[x_out, y_out, out_channel, index_batch] = value + end + end + + end + end + end + end + + return output +end =# +#= function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) @@ -92,66 +347,81 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, return output end +=# + +function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + ∇conv_data!( + NNlib.insert_singleton_spatial_dimension(input_gradient, 1), + NNlib.insert_singleton_spatial_dimension(output_gradient, 1), + NNlib.insert_singleton_spatial_dimension(weight, 1), + NNlib.insert_singleton_spatial_dimension(cdims, 1) + ) + + return input_gradient +end + +function ∇conv_filter_im2col_grouped!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims) where {T<:Real} + + ∇conv_filter!( + NNlib.insert_singleton_spatial_dimension(weight_gradient, 1), + NNlib.insert_singleton_spatial_dimension(input, 1), + NNlib.insert_singleton_spatial_dimension(output_gradient, 1), + NNlib.insert_singleton_spatial_dimension(cdims, 1) + ) + + return weight_gradient +end + +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} -function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - # check if input_gradient must be padded (padding is done naively at the moment) - if cdims.padding != (0, 0, 0, 0) - input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) - else - input_gradient_padded = input_gradient - end - # store the size of input after padding - input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding - - # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end + if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("backward: very specialized case for maximum performance") - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups + # storing all the necessary shapes + output_width, output_height, out_channels, batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded (padding is done naively at the moment) 
+ if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) + else + input_gradient_padded = input_gradient + end - @inline static_size(x::AbstractArray{T, N}) where {T, N} = static.(size(x)) + # store the size of input after padding + input_width, input_height, in_channels, batch_size = size(input_gradient_padded) # size after padding - # actual computation (using @tturbo instead of Threads.@threads + @turbo may end up in wrong results) - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("backward: very specialized case for maximum performance") + # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end output_gradient = OffsetArray(output_gradient, OffsetArrays.Origin(0, 0, 0, 0)) input_gradient_padded = OffsetArray(input_gradient_padded, OffsetArrays.Origin(0, 0, 0, 0)) weight = OffsetArray(weight, OffsetArrays.Origin(0, 0, 0, 0)) - input_width, input_height, in_channels, batch_size = static_size(input_gradient_padded) - weight_width, weight_height, in_channels_weight, out_channels = static_size(weight) - - y_upper_bound = static(output_height) # input_width - weight_width + static(1) - x_upper_bound = static(output_width) # input_height - weight_height + static(1) + input_width, input_height, in_channels, batch_size = size(input_gradient_padded) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) @tturbo for index_batch in 0:batch_size-1 - for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 # @tturbo unroll = (2, 1) + for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 value = zero(T) - for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 - ib0 = (x_in - x_w >= 0) & (x_in - x_w < x_upper_bound) - ib1 = (y_in - y_w >= 0) & (y_in - y_w < y_upper_bound) - output_gradient_value = (ib0 & ib1) ? output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] : zero(T) + for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) + + is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width) + is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height) + output_gradient_value = (is_in_bound_x & is_in_bound_y) ? output_gradient[x_in - x_w, y_in - y_w, out_channel, index_batch] : zero(T) value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value - # value += (ib0 & ib1) ? 
output_gradient[x_in-x_w, y_in-y_w, out_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) + end input_gradient_padded[x_in, y_in, in_channel, index_batch] = value @@ -160,86 +430,15 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ input_gradient_padded = input_gradient_padded.parent - elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance - # println("backward: second specialized case for better performance") - - #= - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - =# - - y_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_height) - x_out_indices = Array{Int, 3}(undef, weight_width, weight_height, input_width) - x_out_indices .= -1 - y_out_indices .= -1 - - @turbo for y_out in 1:output_height, x_out in 1:output_width # - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - y_out_indices[x_w, y_w, y_in] = y_out - x_out_indices[x_w, y_w, x_in] = x_out - end - end - - @tturbo for index_batch in 1:current_batch_size - for x_in in 1:input_width, y_in in 1:input_height, in_channel in 1:in_channels # @tturbo unroll = (2, 1) - - value = zero(T) - for x_w in 1:weight_width, y_w in 1:weight_height, out_channel in 1:out_channels - - x_out = x_out_indices[x_w, y_w, x_in] - y_out = y_out_indices[x_w, y_w, y_in] - - ib0 = x_out > -1 # != - ib1 = y_out > -1 # != - - output_gradient_value = (ib0 & ib1) ? 
output_gradient[x_out, y_out, out_channel, index_batch] : zero(T) - # output_gradient_value = T(2.0) # output_gradient[x_out, y_out, out_channel, index_batch] - value += weight[x_w, y_w, in_channel, out_channel] * output_gradient_value - end - input_gradient[x_in, y_in, in_channel, index_batch] = value - - end + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] end else # general case for any convolution - # println("backward: general case for any convolution") - - Threads.@threads for index_batch in 1:current_batch_size - for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results - @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - input_gradient_padded[x_in, y_in, in_channel_input, index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - - end - - # depad - if cdims.padding != (0, 0, 0, 0) - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] - end + input_gradient = ∇conv_data_im2col_grouped!(input_gradient, output_gradient, weight, cdims) + end return input_gradient end \ No newline at end of file From 3130f8a02ced2271a433e31728f8b11cfbd0e9b9 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:43:33 +0200 Subject: [PATCH 28/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 30 ++++++-------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index f9b5bf867..262ae42d0 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -42,15 +42,18 @@ end conv_settings_list = [ NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' - NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'second specialized case' - NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'second specialized case' + NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'third specialized case' + ### NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 1), dilation=(2, 1), groups=3), # test 'general case' ] conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, batch_size) for setting in 
conv_settings_list] pool_settings_list = [ NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' - NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' + # NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' + NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 1), dilation=(2, 1)), # test 'general case' ] # compute outputs before loading LoopVectorization @@ -67,27 +70,8 @@ end # validate conv @test all(isapprox.(conv_outs_std, conv_outs_lv)) - # @test all(isapprox.(conv_grads_std, conv_grads_lv)) # seems to be wrong on some CI devices, reason unknown + @test all(isapprox.(conv_grads_std, conv_grads_lv)) # validate pool @test all(isapprox.(pool_outs_std, pool_outs_lv)) - @info isapprox(conv_grads_std[1], conv_grads_lv[1]) - println(sum(conv_grads_std[1])); println(sum(conv_grads_lv[1])) - - @info isapprox(conv_grads_std[2], conv_grads_lv[2]) - println(sum(conv_grads_std[2])); println(sum(conv_grads_lv[2])) - - @info isapprox(conv_grads_std[3], conv_grads_lv[3]) - println(sum(conv_grads_std[3])); println(sum(conv_grads_lv[3])) - - @testset "Conv impl 1" begin - @test isapprox(conv_grads_std[1], conv_grads_lv[1]) - end - @testset "Conv impl 2" begin - @test isapprox(conv_grads_std[2], conv_grads_lv[2]) - end - @testset "Conv impl 3" begin - @test isapprox(conv_grads_std[3], conv_grads_lv[3]) - end - end \ No newline at end of file From d87f909e93239f83a570a846a2e2ba4340216392 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 7 Oct 2023 23:08:58 +0200 Subject: [PATCH 29/37] Add files via upload --- ext/NNlibLoopVectorizationExt/conv.jl | 166 ++++++++++++++++++++++---- 1 file changed, 143 insertions(+), 23 deletions(-) diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index e0876db0f..a75026b82 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -1,6 +1,10 @@ -# # pad naivly, static iters +# pad naivly, static iters function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + # if cdims.stride != (1, 1) + # return conv_im2col_grouped!(output, input, weight, cdims) + # end + # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) @@ -32,6 +36,7 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value @@ -43,13 +48,16 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + 
static((y_stride - 1)) * (y_out - 1) - n = x_out + static((x_stride - 1)) * (x_out - 1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) # x_in = n + (x_w - 1) - value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + # value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + value += input[n + x_w - 1, m + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value end @@ -60,8 +68,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + static((y_stride - 1)) * (y_out - 1) - n = x_out + static((x_stride - 1)) * (x_out - 1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) * y_dilation @@ -77,8 +87,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width - m = y_out + static((y_stride - 1)) * (y_out - 1) - n = x_out + static((x_stride - 1)) * (x_out - 1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) @@ -96,7 +108,8 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, return output end -#= # pad bounds check +#= +# pad bounds check function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) @@ -146,8 +159,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) @@ -163,8 +178,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 value = zero(T) for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) # y_in = m + (y_w - 1) * y_dilation @@ -180,8 +197,10 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) @@ -209,10 +228,13 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size # @turbo for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width y_in = m + (y_w - 1) * y_dilation x_in = n + (x_w - 1) * x_dilation @@ -234,11 +256,14 @@ function 
NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size # @turbo for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) + # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) + m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 + n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) - for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width y_in = m + (y_w - 1) * y_dilation x_in = n + (x_w - 1) * x_dilation @@ -349,6 +374,18 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, end =# +function conv_im2col_grouped!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} + + conv!( + NNlib.insert_singleton_spatial_dimension(output, 1), + NNlib.insert_singleton_spatial_dimension(input, 1), + NNlib.insert_singleton_spatial_dimension(weight, 1), + NNlib.insert_singleton_spatial_dimension(cdims, 1) + ) + + return output +end + function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} ∇conv_data!( @@ -415,7 +452,8 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 value = zero(T) - for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) + # for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) # RICHTIG!! + for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width) is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height) @@ -441,4 +479,86 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ end return input_gradient -end \ No newline at end of file +end + +#= +function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} + # println("myconv data back called") + + size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) + cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) + NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) + + # storing all the necessary shapes + output_width, output_height, out_channels, current_batch_size = size(output_gradient) + weight_width, weight_height, in_channels_weight, out_channels = size(weight) + # because in the actual computation section, values are added, it's saver to reset the given input_gradient first + input_gradient .= zero(T) + # check if input_gradient must be padded + if cdims.padding != (0, 0, 0, 0) + input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) + else + input_gradient_padded = input_gradient + end + # store the size of input after padding + input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding + + if !NNlib.flipkernel(cdims) + weight = reverse(weight, dims=(1, 2)) + end + + groups = cdims.groupcount + x_stride, y_stride = cdims.stride + x_dilation, y_dilation = cdims.dilation + out_channels_per_group = out_channels ÷ groups + # actual computation + if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance + # println("very specialized case for maximum performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + elseif groups == 1 # second specialized case for better performance + # println("second specialized case for better performance") + Threads.@threads for index_batch in 1:current_batch_size + @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + else # general case for any convolution + # println("general case for any convolution") + Threads.@threads for index_batch in 1:current_batch_size + for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results + @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width + m = y_out + (y_stride - 1) * (y_out - 1) + n = x_out + (x_stride - 1) * (x_out - 1) + out_channel = (group * out_channels_per_group + 1) - out_channel_per_group + for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation + in_channel_input = in_channel_weight + (group - 1) * in_channels_weight + input_gradient_padded[x_in, y_in, in_channel_input, 
index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] + end + end + end + end + end + + # depad + if cdims.padding != (0, 0, 0, 0) + x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding + input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] + end + + return input_gradient +end +=# \ No newline at end of file From 82abca83149c8824ba4eca1400076c3074212aec Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sat, 7 Oct 2023 23:09:26 +0200 Subject: [PATCH 30/37] Add files via upload --- bench_torch.py | 32 +++++++++++++++ benchmark.jl | 76 ++++++++++++++++++++++++++++++++++++ benchmark_result_julia.csv | 8 ++++ benchmark_result_pytorch.csv | 8 ++++ benchmark_torch.py | 64 ++++++++++++++++++++++++++++++ 5 files changed, 188 insertions(+) create mode 100644 bench_torch.py create mode 100644 benchmark.jl create mode 100644 benchmark_result_julia.csv create mode 100644 benchmark_result_pytorch.csv create mode 100644 benchmark_torch.py diff --git a/bench_torch.py b/bench_torch.py new file mode 100644 index 000000000..1a97dc673 --- /dev/null +++ b/bench_torch.py @@ -0,0 +1,32 @@ +import torch +import torchvision.models as models +from torch.profiler import profile, record_function, ProfilerActivity +import time + +model = models.efficientnet_v2_m() +model.eval() + +b_size = 1 +img = torch.rand(b_size, 3, 224, 224) + +with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof: + with record_function("model_inference"): + pred = model(img) + """ + with record_function("model_backward"): + loss = torch.sum(pred - 0.5) # dummy loss + loss.backward() + """ + +print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1)) +# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1)) + +start1 = time.perf_counter() +pred = model(img) +start2 = time.perf_counter() +loss = torch.sum(pred - 0.5) # dummy loss +loss.backward() +end = time.perf_counter() +print(f"Time used inference: {start2 - start1} seconds") +print(f"Time used backward: {end - start2} seconds") +print(f"Time used inference and backward: {end - start1} seconds") \ No newline at end of file diff --git a/benchmark.jl b/benchmark.jl new file mode 100644 index 000000000..83262f6e0 --- /dev/null +++ b/benchmark.jl @@ -0,0 +1,76 @@ +using NNlib, Flux, Metalhead +using BenchmarkTools +using DataFrames, CSV + +forward(model, input) = model(input) + +dummy_loss(output) = sum(output .- 1) + +function train_step(model, input) + ∇model, ∇input = gradient(model, input) do m, x + dummy_loss(m(x)) + end + return ∇model, ∇input +end + +function benchmark(models, dtype, batch_sizes, channels, spatial_size) + model_names = sort(collect(keys(models))) # make sure the models are always in the same order + forward_times = zeros(length(model_names), length(batch_sizes)) + train_step_times = zeros(length(model_names), length(batch_sizes)) + + for (i, model_name) in enumerate(model_names) + println("Benchmarking $model_name...") + for (j, batch_size) in enumerate(batch_sizes) + + input = rand(dtype, spatial_size..., channels, batch_size) + model = models[model_name] + + forward(model, input) # compilation + train_step(model, input) # compilation + + forward_times[i, j] = @belapsed forward($model, $input) # @elapsed + train_step_times[i, j] = @belapsed train_step($model, $input) # @elapsed + + end + end + + return forward_times, train_step_times +end + +# 
models which should be benchmarked +models = Dict( + "ResNet18" => ResNet(18), + "WideResNet50" => WideResNet(50), + "DenseNet121" => DenseNet(121), + "EfficientNet" => EfficientNet(:b0), + "EfficientNetv2" => EfficientNetv2(:small), + "MobileNetv3" => MobileNetv3(:small), + # "GoogLeNet" => GoogLeNet(), + "ConvNeXt" => ConvNeXt(:tiny), +) + +# the data type and batch sizes which should be benchmarked +dtype = Float32 +batch_sizes = (1, 32) +# size information (e.g. ImageNet-like images) +channels = 3 +spatial_size = (224, 224) # WH + +forward_times1, train_step_times1 = benchmark(models, dtype, batch_sizes, channels, spatial_size) +using LoopVectorization # load LoopVectorization here to load the lv-extension +forward_times2, train_step_times2 = benchmark(models, dtype, batch_sizes, channels, spatial_size) + +df = DataFrame() +df[!, "model_names"] = sort(collect(keys(models))) # make sure the models are always in the same order + +for (i, batch_size) in enumerate(batch_sizes) + df[!, "acceleration inference, batch_size: $batch_size"] = forward_times1[:, i] ./ forward_times2[:, i] + df[!, "acceleration train, batch_size: $batch_size"] = train_step_times1[:, i] ./ train_step_times2[:, i] + + df[!, "im2col, inference, batch_size: $batch_size"] = forward_times1[:, i] + df[!, "lv-ext, inference, batch_size: $batch_size"] = forward_times2[:, i] + df[!, "im2col, train, batch_size: $batch_size"] = train_step_times1[:, i] + df[!, "lv-ext, train, batch_size: $batch_size"] = train_step_times2[:, i] +end + +CSV.write("benchmark_result_julia.csv", df) \ No newline at end of file diff --git a/benchmark_result_julia.csv b/benchmark_result_julia.csv new file mode 100644 index 000000000..13c60f90d --- /dev/null +++ b/benchmark_result_julia.csv @@ -0,0 +1,8 @@ +model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" +ConvNeXt,2.696525688337994,0.8577982223471639,0.3655661,0.1355693,1.0179392,1.1866884,1.1012307960913856,0.9863925898269857,4.8062238,4.364411,14.39549,14.5940776 +DenseNet121,2.8334019239672825,1.5740008937784382,0.1762512,0.0622048,0.7033689,0.4468669,1.263441740346075,0.9985112159016455,2.8772053,2.2772758,13.7427543,13.7632448 +EfficientNet,2.931731906635844,1.3684833239947383,0.1895995,0.0646715,0.7397858,0.5405881,1.0664467750727167,1.1269442586755343,2.4186004,2.2679054,8.6440818,7.6703721 +EfficientNetv2,5.053202171229253,5.590555646664826,0.6446749,0.1275775,12.9539883,2.31712,1.4478919823022125,1.224915348029691,6.3059734,4.3552789,22.0001035,17.9605093 +MobileNetv3,6.416791193464834,1.0744301651818315,0.0515294,0.0080304,0.1731701,0.1611739,1.2257825196180756,1.0673281389045863,0.3963721,0.3233625,1.6299691,1.527149 +ResNet18,1.2245235336423206,1.1195501549857,0.0490358,0.0400448,0.1998762,0.1785326,1.0889384258792703,0.9128196230532792,0.8104152,0.744225,3.1845247,3.4886681 +WideResNet50,0.670115626920713,0.7518492599095661,0.1744445,0.26032,0.7198077,0.957383,0.9693667932236841,0.8498274823283908,4.0205619,4.1476167,15.3664104,18.0817998 diff --git a/benchmark_result_pytorch.csv b/benchmark_result_pytorch.csv new file mode 100644 index 000000000..5f929352b --- /dev/null +++ b/benchmark_result_pytorch.csv @@ 
-0,0 +1,8 @@ +,model_names,"inference, batch_size: 1","train, batch_size: 1","inference, batch_size: 32","train, batch_size: 32" +0,ConvNeXt,0.0805797,0.2446732,1.598144,4.723875 +1,DenseNet121,0.1164808,0.2522871,1.5435163,4.5821567 +2,EfficientNet,0.0598909,0.1230739,0.9899088,2.6415434 +3,EfficientNetv2,0.13141,0.2856547,1.8736856,5.1560116 +4,MobileNetv3,0.0220434,0.0392312,0.2386686,0.5895018 +5,ResNet18,0.0289765,0.0681748,0.4151833,1.2125939 +6,WideResNet50,0.1647507,0.4650428,2.3093054,6.9086385 diff --git a/benchmark_torch.py b/benchmark_torch.py new file mode 100644 index 000000000..55705ad8b --- /dev/null +++ b/benchmark_torch.py @@ -0,0 +1,64 @@ +import torch +import torchvision.models as visionmodels +import time +import pandas as pd + +def dummy_loss(output): + return torch.sum(output - 1) + +def train_step(model, input_to_model): + output = model(input_to_model) + loss = dummy_loss(output) + loss.backward() + +def benchmark(models, batch_sizes, channels, spatial_size): + model_names = sorted(list(models.keys())) # make sure the models are always in the same order + forward_times = torch.zeros(len(model_names), len(batch_sizes)) + train_step_times = torch.zeros(len(model_names), len(batch_sizes)) + + for i, model_name in enumerate(model_names): + print(f"Benchmarking {model_name}...") + for j, batch_size in enumerate(batch_sizes): + + input_to_model = torch.rand(batch_size, channels, spatial_size[0], spatial_size[1]) + model = models[model_name] + + time_start = time.perf_counter() + model(input_to_model) + time_duration = time.perf_counter() - time_start + forward_times[i, j] = time_duration + + time_start = time.perf_counter() + train_step(model, input_to_model) + time_duration = time.perf_counter() - time_start + train_step_times[i, j] = time_duration + + return forward_times, train_step_times + +models = { + "ResNet18" : visionmodels.resnet18(), + "WideResNet50" : visionmodels.wide_resnet50_2(), + "DenseNet121" : visionmodels.densenet121(), + "EfficientNet" : visionmodels.efficientnet_b0(), + "EfficientNetv2" : visionmodels.efficientnet_v2_s(), + "MobileNetv3" : visionmodels.mobilenet_v3_small(), + # "GoogLeNet" : visionmodels.googlenet(), + "ConvNeXt" : visionmodels.convnext_tiny(), +} + +# the batch sizes which should be benchmarked +batch_sizes = (1, 32) +# size information (e.g. 
ImageNet-like images) +channels = 3 +spatial_size = (224, 224) # HW + +forward_times, train_step_times = benchmark(models, batch_sizes, channels, spatial_size) + +df = pd.DataFrame() +df["model_names"] = sorted(list(models.keys())) # make sure the models are always in the same order + +for (i, batch_size) in enumerate(batch_sizes): + df[f"inference, batch_size: {batch_size}"] = forward_times[:, i] + df[f"train, batch_size: {batch_size}"] = train_step_times[:, i] + +df.to_csv("benchmark_result_pytorch.csv") \ No newline at end of file From c5ec7138fbd0e232dfa59482883ede2c44a2bef8 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:32:28 +0200 Subject: [PATCH 31/37] Delete bench_torch.py --- bench_torch.py | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 bench_torch.py diff --git a/bench_torch.py b/bench_torch.py deleted file mode 100644 index 1a97dc673..000000000 --- a/bench_torch.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -import torchvision.models as models -from torch.profiler import profile, record_function, ProfilerActivity -import time - -model = models.efficientnet_v2_m() -model.eval() - -b_size = 1 -img = torch.rand(b_size, 3, 224, 224) - -with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof: - with record_function("model_inference"): - pred = model(img) - """ - with record_function("model_backward"): - loss = torch.sum(pred - 0.5) # dummy loss - loss.backward() - """ - -print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1)) -# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1)) - -start1 = time.perf_counter() -pred = model(img) -start2 = time.perf_counter() -loss = torch.sum(pred - 0.5) # dummy loss -loss.backward() -end = time.perf_counter() -print(f"Time used inference: {start2 - start1} seconds") -print(f"Time used backward: {end - start2} seconds") -print(f"Time used inference and backward: {end - start1} seconds") \ No newline at end of file From 3f1c6dc6adbc20b5653efa1a39b61d475ec1e4f0 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:34:39 +0200 Subject: [PATCH 32/37] Add files via upload --- bench_torch.py | 32 ++++++++++++++++++++++++++++++++ benchmark.jl | 12 +++++++++--- benchmark_result_julia.csv | 14 +++++++------- benchmark_result_pytorch.csv | 14 +++++++------- 4 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 bench_torch.py diff --git a/bench_torch.py b/bench_torch.py new file mode 100644 index 000000000..1a97dc673 --- /dev/null +++ b/bench_torch.py @@ -0,0 +1,32 @@ +import torch +import torchvision.models as models +from torch.profiler import profile, record_function, ProfilerActivity +import time + +model = models.efficientnet_v2_m() +model.eval() + +b_size = 1 +img = torch.rand(b_size, 3, 224, 224) + +with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof: + with record_function("model_inference"): + pred = model(img) + """ + with record_function("model_backward"): + loss = torch.sum(pred - 0.5) # dummy loss + loss.backward() + """ + +print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1)) +# print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=-1)) + +start1 = time.perf_counter() +pred = model(img) +start2 = time.perf_counter() +loss = torch.sum(pred - 0.5) # dummy loss +loss.backward() +end = time.perf_counter() +print(f"Time 
used inference: {start2 - start1} seconds") +print(f"Time used backward: {end - start2} seconds") +print(f"Time used inference and backward: {end - start1} seconds") \ No newline at end of file diff --git a/benchmark.jl b/benchmark.jl index 83262f6e0..072144d51 100644 --- a/benchmark.jl +++ b/benchmark.jl @@ -1,5 +1,5 @@ using NNlib, Flux, Metalhead -using BenchmarkTools +using BenchmarkTools, Statistics using DataFrames, CSV forward(model, input) = model(input) @@ -28,8 +28,14 @@ function benchmark(models, dtype, batch_sizes, channels, spatial_size) forward(model, input) # compilation train_step(model, input) # compilation - forward_times[i, j] = @belapsed forward($model, $input) # @elapsed - train_step_times[i, j] = @belapsed train_step($model, $input) # @elapsed + # using @belapsed (minimum time) + #= + forward_times[i, j] = @belapsed forward($model, $input) + train_step_times[i, j] = @belapsed train_step($model, $input) + =# + # using median time + forward_times[i, j] = median(@benchmark forward($model, $input)).time / 10^9 + train_step_times[i, j] = median(@benchmark train_step($model, $input)).time / 10^9 end end diff --git a/benchmark_result_julia.csv b/benchmark_result_julia.csv index 13c60f90d..6ca132f8e 100644 --- a/benchmark_result_julia.csv +++ b/benchmark_result_julia.csv @@ -1,8 +1,8 @@ model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" -ConvNeXt,2.696525688337994,0.8577982223471639,0.3655661,0.1355693,1.0179392,1.1866884,1.1012307960913856,0.9863925898269857,4.8062238,4.364411,14.39549,14.5940776 -DenseNet121,2.8334019239672825,1.5740008937784382,0.1762512,0.0622048,0.7033689,0.4468669,1.263441740346075,0.9985112159016455,2.8772053,2.2772758,13.7427543,13.7632448 -EfficientNet,2.931731906635844,1.3684833239947383,0.1895995,0.0646715,0.7397858,0.5405881,1.0664467750727167,1.1269442586755343,2.4186004,2.2679054,8.6440818,7.6703721 -EfficientNetv2,5.053202171229253,5.590555646664826,0.6446749,0.1275775,12.9539883,2.31712,1.4478919823022125,1.224915348029691,6.3059734,4.3552789,22.0001035,17.9605093 -MobileNetv3,6.416791193464834,1.0744301651818315,0.0515294,0.0080304,0.1731701,0.1611739,1.2257825196180756,1.0673281389045863,0.3963721,0.3233625,1.6299691,1.527149 -ResNet18,1.2245235336423206,1.1195501549857,0.0490358,0.0400448,0.1998762,0.1785326,1.0889384258792703,0.9128196230532792,0.8104152,0.744225,3.1845247,3.4886681 -WideResNet50,0.670115626920713,0.7518492599095661,0.1744445,0.26032,0.7198077,0.957383,0.9693667932236841,0.8498274823283908,4.0205619,4.1476167,15.3664104,18.0817998 +ConvNeXt,3.0131868428607564,1.3994729097036838,0.4240265,0.1407236,1.43802405,1.0275469,1.1620754379865017,1.0632876498150545,4.6366846,3.9900031,14.7683656,13.8893418 +DenseNet121,2.7011062104755816,1.575624841888005,0.1855009,0.0686759,0.7069096,0.4486535,1.2534693725910626,1.036169100624124,2.6923755,2.1479388,12.6862194,12.2433871 +EfficientNet,6.669989485006747,2.4963127103581892,0.49731575,0.0745602,1.33507035,0.53481695,1.121879889261884,1.1298641673853496,2.5537233,2.2762894,8.1940817,7.2522715 
+EfficientNetv2,16.28186773870062,7.202334907846903,2.5620854,0.1573582,12.053267,1.67352215,1.4558721609174592,1.203183521905458,6.1329556,4.21256465,21.0444893,17.4906728 +MobileNetv3,12.105678302652656,1.5684538123069776,0.1103481,0.0091154,0.31291775,0.19950715,1.2884028351358188,1.1391237206595466,0.43895395,0.3406962,2.0458146,1.7959547 +ResNet18,1.321074637025202,1.0621579200481972,0.0558948,0.0423101,0.2110332,0.19868345,1.0855325609238786,0.8862720054297211,0.8071219,0.7435262,2.98925695,3.3728437 +WideResNet50,0.6797203960326701,0.7846926795922912,0.1863516,0.2741592,0.6916193,0.88138875,0.8693605563082452,0.7977827693085691,3.68315245,4.23662245,13.5918181,17.0369913 diff --git a/benchmark_result_pytorch.csv b/benchmark_result_pytorch.csv index 5f929352b..153e03686 100644 --- a/benchmark_result_pytorch.csv +++ b/benchmark_result_pytorch.csv @@ -1,8 +1,8 @@ ,model_names,"inference, batch_size: 1","train, batch_size: 1","inference, batch_size: 32","train, batch_size: 32" -0,ConvNeXt,0.0805797,0.2446732,1.598144,4.723875 -1,DenseNet121,0.1164808,0.2522871,1.5435163,4.5821567 -2,EfficientNet,0.0598909,0.1230739,0.9899088,2.6415434 -3,EfficientNetv2,0.13141,0.2856547,1.8736856,5.1560116 -4,MobileNetv3,0.0220434,0.0392312,0.2386686,0.5895018 -5,ResNet18,0.0289765,0.0681748,0.4151833,1.2125939 -6,WideResNet50,0.1647507,0.4650428,2.3093054,6.9086385 +0,ConvNeXt,0.0838077,0.3047408,1.6291157,4.6741033 +1,DenseNet121,0.108188,0.2309508,1.5926049,4.6524887 +2,EfficientNet,0.0661599,0.1544869,0.9558551,2.707274 +3,EfficientNetv2,0.129968,0.2956851,1.830179,5.1837134 +4,MobileNetv3,0.022812,0.0398323,0.2456348,0.5598582 +5,ResNet18,0.0305176,0.074905,0.4218851,1.192504 +6,WideResNet50,0.1575364,0.4871242,2.2386081,6.8065705 From 8dde5f98b133f6cfb6bc1da3909c96200b4ae303 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:34:52 +0200 Subject: [PATCH 33/37] Add files via upload --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index fe9314f97..14cee5b4f 100644 --- a/Project.toml +++ b/Project.toml @@ -42,7 +42,6 @@ KernelAbstractions = "0.9.2" Requires = "1.0" cuDNN = "1" julia = "1.9" -# LoopVectorization = "=0.12.146" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" From 55051570e35db58c2e5a1f344c44807b3db6c728 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:35:15 +0200 Subject: [PATCH 34/37] Add files via upload --- .../NNlibLoopVectorizationExt.jl | 5 +- ext/NNlibLoopVectorizationExt/activations.jl | 26 ++ ext/NNlibLoopVectorizationExt/conv.jl | 421 +----------------- ext/NNlibLoopVectorizationExt/pooling.jl | 63 +-- 4 files changed, 83 insertions(+), 432 deletions(-) create mode 100644 ext/NNlibLoopVectorizationExt/activations.jl diff --git a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl index 3be6c28a2..6e4b8d8e0 100644 --- a/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl +++ b/ext/NNlibLoopVectorizationExt/NNlibLoopVectorizationExt.jl @@ -5,11 +5,8 @@ using LoopVectorization using Random, Statistics using OffsetArrays, Static -# Bevor die Methoden überladen werden, sollte ein Selfcheck durchgeführt werden, ob die Ergebnisse mit NNlib übereinstimmen. -# Wenn nicht, sollte eine Warnung angezeigt werden und die wenn irgendwie möglich, nur die funktionierenden Methoden verwendet werden. -# Z.b. 
in dem bei falschem Ergebniss, die im2col Methode als Backend in der überladenen Methode aufgerufen wird. - include("conv.jl") include("pooling.jl") +include("activations.jl") end # module \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/activations.jl b/ext/NNlibLoopVectorizationExt/activations.jl new file mode 100644 index 000000000..8201e7299 --- /dev/null +++ b/ext/NNlibLoopVectorizationExt/activations.jl @@ -0,0 +1,26 @@ +_tanh(x) = tanh(x) +Base.broadcasted(::typeof(tanh), x::AbstractArray) = @turbo _tanh.(x) + +_softsign(x) = x / (1 + abs(x)) +Base.broadcasted(::typeof(NNlib.softsign), x::AbstractArray) = @turbo _softsign.(x) + +_softplus(x) = log1p(exp(-abs(x))) +Base.broadcasted(::typeof(NNlib.softplus), x::AbstractArray) = (@turbo _softplus.(x)) .+ NNlib.relu.(x) + +function _sigmoid(x) + t = exp(-abs(x)) + ifelse(x ≥ 0, inv(1 + t), t / (1 + t)) +end +Base.broadcasted(::typeof(NNlib.sigmoid), x::AbstractArray) = @turbo _sigmoid.(x) +Base.broadcasted(::typeof(NNlib.sigmoid_fast), x::AbstractArray) = @turbo _sigmoid.(x) # don't do the same for tanh_fast, it would be slower + +function _hardsigmoid(x) + clamp((x + 3) / 6, 0, 1) +end +Base.broadcasted(::typeof(NNlib.hardsigmoid), x::AbstractArray) = @turbo _hardsigmoid.(x) + +_logsigmoid(x) = -_softplus(-x) +Base.broadcasted(::typeof(NNlib.logsigmoid), x::AbstractArray) = @turbo _logsigmoid.(x) + +_swish(x) = x * _sigmoid(x) +Base.broadcasted(::typeof(NNlib.swish), x::AbstractArray) = @turbo _swish.(x) \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/conv.jl b/ext/NNlibLoopVectorizationExt/conv.jl index a75026b82..0ca75f466 100644 --- a/ext/NNlibLoopVectorizationExt/conv.jl +++ b/ext/NNlibLoopVectorizationExt/conv.jl @@ -1,9 +1,19 @@ -# pad naivly, static iters -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} +#= +Accelerated convolution for 2d-images using the power of LoopVectorization. +The acceleration is usually greatest when the inputs have a large spatial size and few channels. +Using stride > 1, dilation > 1 or groups > 1 can slow down things a bit. + +Since the current state of LoopVectorization ∇conv_filter! isn't really faster than the +original implementation in some situations, it is left out for the moment. 
+ +Implementation for forward pass mostly copied from here (Jonas Steinebach, MIT license): +https://github.com/jonas208/GradValley.jl/blob/main/src/functional/gv_convolution.jl - # if cdims.stride != (1, 1) - # return conv_im2col_grouped!(output, input, weight, cdims) - # end +Implementation for backward pass mostly copied from here (Chris Elrod, MIT license): +https://github.com/PumasAI/SimpleChains.jl/blob/main/src/conv.jl +=# + +function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) @@ -48,15 +58,11 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - # m = y_out + static((y_stride - 1)) * (y_out - 1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) m = y_out + (y_stride - 1) * (y_out - 1) n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) - # x_in = n + (x_w - 1) - # value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width value += input[n + x_w - 1, m + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value @@ -68,14 +74,11 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, @tturbo for index_batch in 1:batch_size for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - # m = y_out + static((y_stride - 1)) * (y_out - 1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) m = y_out + (y_stride - 1) * (y_out - 1) n = x_out + (x_stride - 1) * (x_out - 1) value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation + # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) + for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] end output[x_out, y_out, out_channel, index_batch] = value @@ -86,283 +89,14 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, # println("forward: general case for any convolution") @tturbo for index_batch in 1:batch_size - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width - # m = y_out + static((y_stride - 1)) * (y_out - 1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - for in_channel_weight in static(1):static(in_channels_weight), y_w 
in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - end - - return output -end - -#= -# pad bounds check -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) where {T<:Real} - - # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) - - output_width, output_height, _ = size(output) - input_width, input_height, in_channels, batch_size = size(input) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - out_channels_per_group = out_channels ÷ groups - - # We use calc_padding_regions to split outselves up into separate regions that may or - # may not need to worry about padding: - cdims_3d = DenseConvDims((input_width, input_height, 1, in_channels, batch_size), (weight_width, weight_height, 1, in_channels_weight, out_channels), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) - # println(pdims_3d.padding) - padded_regions, central_region = NNlib.calc_padding_regions(cdims_3d) - - # Start with the central region - w_region, h_region, _ = central_region - - if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("forward: very specialized case for maximum performance") - - @tturbo for index_batch in 1:batch_size - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - value += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - elseif groups == 1 && cdims.dilation == (1, 1) # second specialized case for better performance - # println("forward: second specialized case for better performance") - - @tturbo for index_batch in 1:batch_size - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w 
in static(1):static(weight_width) - # y_in = m + (y_w - 1) - # x_in = n + (x_w - 1) - value += input[n + (x_w - 1), m + (y_w - 1), in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - elseif groups == 1 # third specialized case for better performance - # println("forward: third specialized case for better performance") - - @tturbo for index_batch in 1:batch_size - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - value = zero(T) - for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation - value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - else # general case for any convolution - # println("forward: general case for any convolution") - - @tturbo for index_batch in 1:batch_size - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - # y_in = m + (y_w - 1) * y_dilation - # x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - end - - # @show w_region # 1:0 warning: when padding is unequal and one of x_pad1 or y_pad1 is 0, emty collections are possible - # @show h_region # 1:0 if isempty(1:0) -> true - # println() - - if cdims.padding != (0, 0, 0, 0) - # Next, the padded regions - for (w_region, h_region, d_region) in padded_regions # @inbounds - for z_out in d_region # for skipping the d_regions - - if cdims.groupcount == 1 - - @tturbo for index_batch in 1:batch_size # @turbo - for out_channel in 1:out_channels, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - value = zero(T) - # for in_channel in static(1):static(in_channels), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - - is_in_bound_x = (x_in >= 1) & (x_in <= input_width) - is_in_bound_y = (y_in >= 1) & (y_in <= input_height) - - input_value = 
(is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel, index_batch] : zero(T) - value += input_value * weight[x_w, y_w, in_channel, out_channel] - - # value += (ib0 & ib1) ? input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] : zero(T) - - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - else - - @tturbo for index_batch in 1:batch_size # @turbo - for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in h_region, x_out in w_region - # m = y_out + static((y_stride - 1)) * (y_out - 1) - static(y_pad1) - # n = x_out + static((x_stride - 1)) * (x_out - 1) - static(x_pad1) - m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 - n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - value = zero(T) - # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - - is_in_bound_x = (x_in >= 1) & (x_in <= input_width) - is_in_bound_y = (y_in >= 1) & (y_in <= input_height) - - input_value = (is_in_bound_x & is_in_bound_y) ? input[x_in, y_in, in_channel_input, index_batch] : zero(T) - value += input_value * weight[x_w, y_w, in_channel_weight, out_channel] - - # value += (ib0 & ib1) ? input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] : zero(T) - - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - end - end - end - end - - return output -end -=# - -#= -function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} - - # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input), size_weight_check_dims, size(output), cdims_check_dims) - - # padding is done naively at the moment - if cdims.padding != (0, 0, 0, 0) - input = NNlib.pad_zeros(input, cdims.padding, dims=(1, 2)) - end - - output_width, output_height, _ = size(output) - input_width, input_height, in_channels, batches = size(input) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - - # it's necessary to flip the kernel if real convolution is performed (flipkernel=false) - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - - if cdims.groupcount == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("forward: very specialized case for maximum performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - value += input[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - elseif groups == 1 # second specialized case for better performance - # println("forward: second specialized case for better performance") - - @tturbo for index_batch in 1:batches - for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - value = zero(T) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - value += input[x_in, y_in, in_channel, index_batch] * weight[x_w, y_w, in_channel, out_channel] - end - output[x_out, y_out, out_channel, index_batch] = value - end - end - - else # general case for any convolution - # println("forward: general case for any convolution") - - @tturbo for index_batch in 1:batches for group in 1:groups, out_channel_per_group in 1:out_channels_per_group, y_out in 1:output_height, x_out in 1:output_width m = y_out + (y_stride - 1) * (y_out - 1) n = x_out + (x_stride - 1) * (x_out - 1) out_channel = (group * out_channels_per_group + 1) - out_channel_per_group value = zero(T) + # for in_channel_weight in static(1):static(in_channels_weight), y_w in static(1):static(weight_height), x_w in static(1):static(weight_width) for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - value += input[x_in, y_in, in_channel_input, index_batch] * weight[x_w, y_w, in_channel_weight, out_channel] + value += input[n + (x_w - 1) * x_dilation, m + (y_w - 1) * y_dilation, in_channel_weight + (group - 1) * in_channels_weight, index_batch] * weight[x_w, y_w, 
in_channel_weight, out_channel] end output[x_out, y_out, out_channel, index_batch] = value end @@ -372,19 +106,6 @@ function NNlib.conv!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, return output end -=# - -function conv_im2col_grouped!(output::Array{T,4}, input::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} - - conv!( - NNlib.insert_singleton_spatial_dimension(output, 1), - NNlib.insert_singleton_spatial_dimension(input, 1), - NNlib.insert_singleton_spatial_dimension(weight, 1), - NNlib.insert_singleton_spatial_dimension(cdims, 1) - ) - - return output -end function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} @@ -398,18 +119,6 @@ function ∇conv_data_im2col_grouped!(input_gradient::Array{T,4}, output_gradien return input_gradient end -function ∇conv_filter_im2col_grouped!(weight_gradient::Array{T,4}, input::Array{T,4}, output_gradient::Array{T,4}, cdims::ConvDims) where {T<:Real} - - ∇conv_filter!( - NNlib.insert_singleton_spatial_dimension(weight_gradient, 1), - NNlib.insert_singleton_spatial_dimension(input, 1), - NNlib.insert_singleton_spatial_dimension(output_gradient, 1), - NNlib.insert_singleton_spatial_dimension(cdims, 1) - ) - - return weight_gradient -end - function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims) where {T<:Real} # fix for groupcount > 1 (NNlib.check_dims would throw an error otherwise) @@ -452,8 +161,8 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ for x_in in 0:input_width-1, y_in in 0:input_height-1, in_channel in 0:in_channels-1 value = zero(T) - # for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) # RICHTIG!! - for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 + for x_w in static(0):static(weight_width-1), y_w in static(0):static(weight_height-1), out_channel in static(0):static(out_channels-1) + # for x_w in 0:weight_width-1, y_w in 0:weight_height-1, out_channel in 0:out_channels-1 is_in_bound_x = (x_in - x_w >= 0) & (x_in - x_w < output_width) is_in_bound_y = (y_in - y_w >= 0) & (y_in - y_w < output_height) @@ -479,86 +188,4 @@ function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{ end return input_gradient -end - -#= -function NNlib.∇conv_data!(input_gradient::Array{T,4}, output_gradient::Array{T,4}, weight::Array{T,4}, cdims::ConvDims; kw...) 
where {T<:Real} - # println("myconv data back called") - - size_weight_check_dims = (size(weight)[1:2]..., size(weight)[3]*cdims.groupcount, size(weight)[4]) - cdims_check_dims = DenseConvDims(size(input_gradient), size_weight_check_dims, stride=cdims.stride, padding=cdims.padding, dilation=cdims.dilation, groups=1, flipkernel=cdims.flipkernel) - NNlib.check_dims(size(input_gradient), size_weight_check_dims, size(output_gradient), cdims_check_dims) - - # storing all the necessary shapes - output_width, output_height, out_channels, current_batch_size = size(output_gradient) - weight_width, weight_height, in_channels_weight, out_channels = size(weight) - # because in the actual computation section, values are added, it's saver to reset the given input_gradient first - input_gradient .= zero(T) - # check if input_gradient must be padded - if cdims.padding != (0, 0, 0, 0) - input_gradient_padded = NNlib.pad_zeros(input_gradient, cdims.padding, dims=(1, 2)) - else - input_gradient_padded = input_gradient - end - # store the size of input after padding - input_width, input_height, in_channels, current_batch_size = size(input_gradient_padded) # size after padding - - if !NNlib.flipkernel(cdims) - weight = reverse(weight, dims=(1, 2)) - end - - groups = cdims.groupcount - x_stride, y_stride = cdims.stride - x_dilation, y_dilation = cdims.dilation - out_channels_per_group = out_channels ÷ groups - # actual computation - if groups == 1 && cdims.stride == (1, 1) && cdims.dilation == (1, 1) # very specialized case for maximum performance - # println("very specialized case for maximum performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - input_gradient_padded[x_out + x_w - 1, y_out + y_w - 1, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - elseif groups == 1 # second specialized case for better performance - # println("second specialized case for better performance") - Threads.@threads for index_batch in 1:current_batch_size - @turbo for out_channel in 1:out_channels, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - for in_channel in 1:in_channels, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - input_gradient_padded[x_in, y_in, in_channel, index_batch] += weight[x_w, y_w, in_channel, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - else # general case for any convolution - # println("general case for any convolution") - Threads.@threads for index_batch in 1:current_batch_size - for out_channel_per_group in 1:out_channels_per_group # putting @turbo here may end up in wrong results - @turbo for group in 1:groups, y_out in 1:output_height, x_out in 1:output_width - m = y_out + (y_stride - 1) * (y_out - 1) - n = x_out + (x_stride - 1) * (x_out - 1) - out_channel = (group * out_channels_per_group + 1) - out_channel_per_group - for in_channel_weight in 1:in_channels_weight, y_w in 1:weight_height, x_w in 1:weight_width - y_in = m + (y_w - 1) * y_dilation - x_in = n + (x_w - 1) * x_dilation - in_channel_input = in_channel_weight + (group - 1) * in_channels_weight - input_gradient_padded[x_in, y_in, in_channel_input, 
index_batch] += weight[x_w, y_w, in_channel_weight, out_channel] * output_gradient[x_out, y_out, out_channel, index_batch] - end - end - end - end - end - - # depad - if cdims.padding != (0, 0, 0, 0) - x_pad1, x_pad2, y_pad1, y_pad2 = cdims.padding - input_gradient .= input_gradient_padded[x_pad1+1:input_width-x_pad2, y_pad1+1:input_height-y_pad2, :, :] - end - - return input_gradient -end -=# \ No newline at end of file +end \ No newline at end of file diff --git a/ext/NNlibLoopVectorizationExt/pooling.jl b/ext/NNlibLoopVectorizationExt/pooling.jl index 420bb7413..5af194408 100644 --- a/ext/NNlibLoopVectorizationExt/pooling.jl +++ b/ext/NNlibLoopVectorizationExt/pooling.jl @@ -13,8 +13,8 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; NNlib.check_dims(size(input), size(output), pdims) # storing all the necessary shapes - input_width, input_height, channels, current_batch_size = size(input) - output_width, output_height, channels, current_batch_size = size(output) + input_width, input_height, channels, batch_size = size(input) + output_width, output_height, channels, batch_size = size(output) kernel_width, kernel_height = pdims.kernel_size x_stride, y_stride = pdims.stride @@ -26,7 +26,7 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; # We use calc_padding_regions to split outselves up into separate regions that may or # may not need to worry about padding: - pdims_3d = PoolDims((input_width, input_height, 1, channels, current_batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) + pdims_3d = PoolDims((input_width, input_height, 1, channels, batch_size), (kernel_width, kernel_height, 1), stride=(x_stride, y_stride, 1), padding=(x_pad1, x_pad2, y_pad1, y_pad2, 0, 0), dilation=(x_dilation, y_dilation, 1)) # println(pdims_3d.padding) padded_regions, central_region = NNlib.calc_padding_regions(pdims_3d) @@ -40,12 +40,11 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; if pdims.stride == (1, 1) && pdims.dilation == (1, 1) # specialized case for better performance # println("specialized case for better performance") - @tturbo for index_batch in 1:current_batch_size + @tturbo for index_batch in 1:batch_size # compute pooling for each channel separatly for channel in 1:channels, y_out in h_region, x_out in w_region kernel_sum = zero(T) for y_w in 1:kernel_height, x_w in 1:kernel_width - # kernel_sum += input[x_out + x_w - 1, y_out + y_w - 1, channel, index_batch] kernel_sum += input[x_out + x_w - 1 - x_pad1, y_out + y_w - 1 - y_pad1, channel, index_batch] end output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha @@ -55,52 +54,54 @@ function NNlib.meanpool!(output::Array{T,4}, input::Array{T,4}, pdims::PoolDims; else # general case for any meanpooling # println("general case for any meanpooling") - @tturbo for index_batch in 1:current_batch_size + @tturbo for index_batch in 1:batch_size # compute pooling for each channel separatly for channel in 1:channels, y_out in h_region, x_out in w_region m = y_out + (y_stride - 1) * (y_out - 1) - y_pad1 n = x_out + (x_stride - 1) * (x_out - 1) - x_pad1 kernel_sum = zero(T) for y_w in 1:kernel_height, x_w in 1:kernel_width - y_in = m + (y_w - 1) * y_dilation # - y_pad1 - x_in = n + (x_w - 1) * x_dilation # - x_pad1 + y_in = m + (y_w - 1) * y_dilation + x_in = n + (x_w - 1) * x_dilation kernel_sum += input[x_in, y_in, channel, 
index_batch] end output[x_out, y_out, channel, index_batch] = kernel_sum * _alpha end end - end + end # Next, the padded regions - @inbounds for (w_region, h_region, d_region) in padded_regions - for index_batch in 1:current_batch_size, channel in 1:channels - for d in d_region # for skipping the d_regions - for h in h_region - ph = project(h, y_stride, y_pad1) - for w in w_region - pw = project(w, x_stride, x_pad1) - m = zero(T) - - for kh in 1:kernel_height - input_kh = ph + (kh - 1) * y_dilation - if input_kh <= 0 || input_kh > input_height - continue - end + if pdims.padding != (0, 0, 0, 0) + @inbounds for (w_region, h_region, d_region) in padded_regions + for index_batch in 1:batch_size, channel in 1:channels + for z_out in d_region # for skipping the d_regions + for y_out in h_region + m = project(y_out, y_stride, y_pad1) + for x_out in w_region + n = project(x_out, x_stride, x_pad1) + kernel_sum = zero(T) - for kw in 1:kernel_width - input_kw = pw + (kw - 1) * x_dilation - if input_kw <= 0 || input_kw > input_width + for y_w in 1:kernel_height + y_in = m + (y_w - 1) * y_dilation + if y_in <= 0 || y_in > input_height continue end - m += input[input_kw, input_kh, channel, index_batch] + for x_w in 1:kernel_width + x_in = n + (x_w - 1) * x_dilation + if x_in <= 0 || x_in > input_width + continue + end + + kernel_sum += input[x_in, y_in, channel, index_batch] + end end - end - output[w, h, channel, index_batch] = _alpha * m - end - end + output[x_out, y_out, channel, index_batch] = _alpha * kernel_sum + end + end + end end end end From 0aa3a3fe1e727f68fda6013c69b5e5139abdd97d Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:36:37 +0200 Subject: [PATCH 35/37] Add files via upload From 35f2b772944ba594aa93483691501c79cf65808c Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:36:53 +0200 Subject: [PATCH 36/37] Add files via upload --- test/ext_loopvectorization/runtests.jl | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/test/ext_loopvectorization/runtests.jl b/test/ext_loopvectorization/runtests.jl index 262ae42d0..f3cc771c8 100644 --- a/test/ext_loopvectorization/runtests.jl +++ b/test/ext_loopvectorization/runtests.jl @@ -32,7 +32,7 @@ function compute_pool_outputs(settings::Vector{<:NNlib.PoolDims}, input::Array{T return pool_outs end -@testset "Convolution & Pooling" begin +@testset "Convolution & Pooling & Activations" begin dtype = Float32 # Float64 batch_size = 64 # 1 # 64 # 32 @@ -44,34 +44,44 @@ end NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'very specialized case' NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(1, 1), groups=1), # test 'second specialized case' NNlib.DenseConvDims(size(input), size(weight_ungrouped), stride=(2, 1), padding=(0, 0), dilation=(2, 1), groups=1), # test 'third specialized case' - ### NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' - NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 1), dilation=(2, 1), groups=3), # test 'general case' + NNlib.DenseConvDims(size(input), size(weight_grouped), stride=(2, 1), padding=(2, 0), dilation=(2, 1), groups=3), # test 'general case' ] conv_output_grads = [rand(dtype, NNlib.output_size(setting)..., 27, 
batch_size) for setting in conv_settings_list] pool_settings_list = [ NNlib.PoolDims(size(input), (5, 4), stride=(1, 1), padding=(0, 0), dilation=(1, 1)), # test 'specialized case' - # NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' - NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 1), dilation=(2, 1)), # test 'general case' + NNlib.PoolDims(size(input), (5, 4), stride=(5, 4), padding=(2, 0), dilation=(2, 1)), # test 'general case' ] + modified_activations = [tanh, softsign, softplus, sigmoid, sigmoid_fast, hardsigmoid, logsigmoid, swish] + # compute outputs before loading LoopVectorization println("without LoopVectorization") conv_outs_std, conv_grads_std = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_std = compute_pool_outputs(pool_settings_list, input) + act_outs_std = [@btime $act.($input) for act in modified_activations] using LoopVectorization # now load the NNlibLoopVectorizationExt println("with LoopVectorization") conv_outs_lv, conv_grads_lv = compute_conv_outputs(conv_settings_list, input, weight_ungrouped, weight_grouped, conv_output_grads) pool_outs_lv = compute_pool_outputs(pool_settings_list, input) + act_outs_lv = [@btime $act.($input) for act in modified_activations] # validate conv - @test all(isapprox.(conv_outs_std, conv_outs_lv)) - @test all(isapprox.(conv_grads_std, conv_grads_lv)) + @testset "Convolution" begin + @test all(isapprox.(conv_outs_std, conv_outs_lv)) + @test all(isapprox.(conv_grads_std, conv_grads_lv)) + end # validate pool - @test all(isapprox.(pool_outs_std, pool_outs_lv)) + @testset "Pooling" begin + @test all(isapprox.(pool_outs_std, pool_outs_lv)) + end + # validate activations + @testset "Activations" begin + @test all(isapprox.(act_outs_std, act_outs_lv)) + end end \ No newline at end of file From 07943d7213f22dba49b44c8c2762117e6abd1e58 Mon Sep 17 00:00:00 2001 From: Jonas S <74866941+jonas208@users.noreply.github.com> Date: Mon, 9 Oct 2023 11:49:46 +0200 Subject: [PATCH 37/37] Add files via upload --- benchmark_result_julia_BLAS.set_num_threads(1).csv | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 benchmark_result_julia_BLAS.set_num_threads(1).csv diff --git a/benchmark_result_julia_BLAS.set_num_threads(1).csv b/benchmark_result_julia_BLAS.set_num_threads(1).csv new file mode 100644 index 000000000..8f84f3c6d --- /dev/null +++ b/benchmark_result_julia_BLAS.set_num_threads(1).csv @@ -0,0 +1,8 @@ +model_names,"acceleration inference, batch_size: 1","acceleration train, batch_size: 1","im2col, inference, batch_size: 1","lv-ext, inference, batch_size: 1","im2col, train, batch_size: 1","lv-ext, train, batch_size: 1","acceleration inference, batch_size: 32","acceleration train, batch_size: 32","im2col, inference, batch_size: 32","lv-ext, inference, batch_size: 32","im2col, train, batch_size: 32","lv-ext, train, batch_size: 32" +ConvNeXt,2.3560260677286187,1.177474414020855,0.4724393,0.2005238,1.5661168,1.3300644,1.1190087278994194,1.0605556431775283,6.1761583,5.519312,20.2704419,19.1130395 +DenseNet121,3.122060107720349,1.6430011190729519,0.2130831,0.0682508,0.7872384,0.4791466,1.0415097744527333,0.9663276503256204,2.2208416,2.1323291,13.0980777,13.5544892 +EfficientNet,7.620280694206955,2.614653451310472,0.55973705,0.0734536,1.3205672,0.50506395,1.0814317166163767,1.4227466786836107,2.3782218,2.1991419,7.8866889,5.543284 
+EfficientNetv2,19.58985619614861,6.567161064233096,3.0016087,0.1532226,12.1968172,1.8572435,1.393368850971046,1.4053620350129883,5.7712793,4.1419609,21.7360648,15.4665234 +MobileNetv3,11.880463635205707,1.4356279307443949,0.1165919,0.00981375,0.31690295,0.2207417,1.2244903507960017,1.0962253667265058,0.4233406,0.345728,1.9161035,1.7479102 +ResNet18,1.8553921481455138,1.1963831276674246,0.0782507,0.04217475,0.26524125,0.2217026,0.7610563817484197,0.8134254105937594,0.5683152,0.7467452,3.1617936,3.8870111 +WideResNet50,1.244074585655623,1.0739855756563101,0.33919395,0.2726476,1.0936016,1.0182647,0.625895474130196,0.8091050354680169,2.59841495,4.1515158,16.8332114,20.8047295
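
A minimal usage sketch (not taken verbatim from the patches above; array sizes and variable names are illustrative), following the approach in test/ext_loopvectorization/runtests.jl: loading LoopVectorization activates NNlibLoopVectorizationExt, so the same convolution can be evaluated with the default im2col backend first and with the @tturbo kernels afterwards, and the two results compared.

    using NNlib

    x = rand(Float32, 64, 64, 3, 2)       # WHCN input (illustrative size)
    w = rand(Float32, 5, 5, 3, 16)        # 5x5 kernel, 3 -> 16 channels
    cdims = NNlib.DenseConvDims(size(x), size(w); stride=(1, 1), padding=(2, 2), dilation=(1, 1))

    y_im2col = NNlib.conv(x, w, cdims)    # default im2col backend

    using LoopVectorization               # loading this triggers NNlibLoopVectorizationExt

    y_lv = NNlib.conv(x, w, cdims)        # conv! now dispatches to the LoopVectorization kernels

    @assert isapprox(y_im2col, y_lv)      # both backends should agree

Loading LoopVectorization after the first call mirrors what the test suite itself does; in normal use it can simply be loaded up front alongside NNlib and Flux.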