From db0eda84fb5e3bf7d732c408476208dfe03944b1 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 00:36:44 +0530 Subject: [PATCH 01/12] added saxpy working version - Float32 --- deps/onemkl.cpp | 3 +++ deps/onemkl.h | 2 ++ lib/mkl/libonemkl.jl | 4 ++++ lib/mkl/linalg.jl | 5 +++++ lib/mkl/wrappers.jl | 22 ++++++++++++++++++++-- test/onemkl.jl | 22 ++++++++++++++++++++++ 6 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 test/onemkl.jl diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index ba9654ba..77b3f91d 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -81,6 +81,9 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, return 0; } +extern "C" void onemklSaxpy(syclQueue_t device_queue, int64_t n, float alpha, const float *x, std::int64_t incx, float *y, int64_t incy) { + oneapi::mkl::blas::column_major::axpy(device_queue->val, n, alpha, x, incx, y, incy); +} // other diff --git a/deps/onemkl.h b/deps/onemkl.h index 7e7e065b..ca48ca1e 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -39,6 +39,8 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); +void onemklSaxpy(syclQueue_t device_queue, int64_t n, float alpha, const float *x, int64_t incx, float *y, int64_t incy); + void onemklDestroy(); #ifdef __cplusplus } diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 042b5e3a..f2ac28dd 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -41,3 +41,7 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld B::ZePtr{ComplexF64}, ldb::Int64, beta::ComplexF64, C::ZePtr{ComplexF64}, ldc::Int64)::Cint end + +function onemklSaxpy(device_queue, n, alpha, x, incx, y, incy) + @ccall liboneapi_support.onemklSaxpy(device_queue::syclQueue_t, n::Int64, alpha::Cfloat, x::ZePtr{Cfloat}, incx::Int64, y::ZePtr{Cfloat}, incy::Int64)::Cvoid +end diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index d1d6ae6b..048a180b 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -49,6 +49,11 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end end +function LinearAlgebra.axpy!(alpha::Number, x::oneStridedVecOrMat{<:onemklFloat}, y::oneStridedVecOrMat{<:onemklFloat}) where T<:Union{Float16, ComplexF16, onemklFloat} + length(x)==length(y) || throw(DimensionMismatch("axpy arguments have lengths $(length(x)) and $(length(y))")) + oneMKL.axpy!(length(x), alpha, x, y) +end + for NT in (Number, Real) # NOTE: alpha/beta also ::Real to avoid ambiguities with certain Base methods @eval begin diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 64da7c37..f9244e51 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -14,8 +14,26 @@ function Base.convert(::Type{onemklTranspose}, trans::Char) end end - - +# level 1 +## axpy +for (fname, elty) in + ((:onemklDaxpy,:Float64), + (:onemklSaxpy,:Float32), + (:onemklZaxpy,:ComplexF64), + (:onemklCaxpy,:ComplexF32)) + @eval begin + function axpy!(n::Integer, + alpha::Number, + x::StridedArray{$elty}, + y::StridedArray{$elty} + ) + queue = global_queue(context(x), device(x)) + alpha = $elty(alpha) + $fname(sycl_queue(queue), n, alpha, x, stride(x,1), y, stride(y,1)) + y + end + end +end # # BLAS # diff --git a/test/onemkl.jl b/test/onemkl.jl new file mode 100644 index 00000000..aeb9044a --- /dev/null +++ b/test/onemkl.jl @@ -0,0 +1,22 @@ +using oneAPI +using oneAPI.oneMKL +using LinearAlgebra + +m = 20 +n = 35 +k = 13 + +##### +@testset "level 1" begin + @testset for T in eltypes + if T === Float32 + A = rand(T,m) + B = rand(T, m) + #gpuA = oneArray(A) + #gpuB = oneArray{T}(undef, m) + alpha = rand() + #oneMKL.axpy!(m, alpha, gpuA, gpuB) + @test testf(axpy!, alpha, A, B) + end + end +end From 334bab2bc9db3a86afc181e8e686b2e7b46ffe7d Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 00:56:50 +0530 Subject: [PATCH 02/12] enabled C & D types --- deps/onemkl.cpp | 8 ++++++++ deps/onemkl.h | 3 +++ lib/mkl/libonemkl.jl | 8 ++++++++ lib/mkl/oneMKL.jl | 2 +- test/onemkl.jl | 2 +- 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index 77b3f91d..45ff9627 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -85,6 +85,14 @@ extern "C" void onemklSaxpy(syclQueue_t device_queue, int64_t n, float alpha, co oneapi::mkl::blas::column_major::axpy(device_queue->val, n, alpha, x, incx, y, incy); } +extern "C" void onemklDaxpy(syclQueue_t device_queue, int64_t n, double alpha, const double *x, std::int64_t incx, double *y, int64_t incy) { + oneapi::mkl::blas::column_major::axpy(device_queue->val, n, alpha, x, incx, y, incy); +} + +extern "C" void onemklCaxpy(syclQueue_t device_queue, int64_t n, float _Complex alpha, const float _Complex *x, std::int64_t incx, float _Complex *y, int64_t incy) { + oneapi::mkl::blas::column_major::axpy(device_queue->val, n, alpha, reinterpret_cast *>(x), incx, reinterpret_cast *>(y), incy); +} + // other // oneMKL keeps a cache of SYCL queues and tries to destroy them when unloading the library. diff --git a/deps/onemkl.h b/deps/onemkl.h index ca48ca1e..ce43452e 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -40,6 +40,9 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex *C, int64_t ldc); void onemklSaxpy(syclQueue_t device_queue, int64_t n, float alpha, const float *x, int64_t incx, float *y, int64_t incy); +void onemklDaxpy(syclQueue_t device_queue, int64_t n, double alpha, const double *x, int64_t incx, double *y, int64_t incy); +void onemklCaxpy(syclQueue_t device_queue, int64_t n, float _Complex alpha, const float _Complex *x, int64_t incx, float _Complex *y, int64_t incy); + void onemklDestroy(); #ifdef __cplusplus diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index f2ac28dd..9b481f20 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -45,3 +45,11 @@ end function onemklSaxpy(device_queue, n, alpha, x, incx, y, incy) @ccall liboneapi_support.onemklSaxpy(device_queue::syclQueue_t, n::Int64, alpha::Cfloat, x::ZePtr{Cfloat}, incx::Int64, y::ZePtr{Cfloat}, incy::Int64)::Cvoid end + +function onemklDaxpy(device_queue, n, alpha, x, incx, y, incy) + @ccall liboneapi_support.onemklDaxpy(device_queue::syclQueue_t, n::Int64, alpha::Cdouble, x::ZePtr{Cdouble}, incx::Int64, y::ZePtr{Cdouble}, incy::Int64)::Cvoid +end + +function onemklCaxpy(device_queue, n, alpha, x, incx, y, incy) + @ccall liboneapi_support.onemklCaxpy(device_queue::syclQueue_t, n::Int64, alpha::ComplexF32, x::ZePtr{ComplexF32}, incx::Int64, y::ZePtr{ComplexF32}, incy::Int64)::Cvoid +end diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl index d83f2141..41dfbea0 100644 --- a/lib/mkl/oneMKL.jl +++ b/lib/mkl/oneMKL.jl @@ -12,7 +12,7 @@ using GPUArrays include("libonemkl.jl") -const onemklFloat = Union{Float64,Float32,Float16,ComplexF64,ComplexF32} +const onemklFloat = Union{Float64,Float32,ComplexF64,ComplexF32} include("wrappers.jl") include("linalg.jl") diff --git a/test/onemkl.jl b/test/onemkl.jl index aeb9044a..33839fb3 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -9,7 +9,7 @@ k = 13 ##### @testset "level 1" begin @testset for T in eltypes - if T === Float32 + if T <:oneMKL.onemklFloat A = rand(T,m) B = rand(T, m) #gpuA = oneArray(A) From 1a08c67d739ab3d91d3fadd29b0a5d2cc8a01319 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 01:00:16 +0530 Subject: [PATCH 03/12] supported Z type --- deps/onemkl.cpp | 4 ++++ deps/onemkl.h | 2 +- lib/mkl/libonemkl.jl | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index 45ff9627..282ed1fc 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -93,6 +93,10 @@ extern "C" void onemklCaxpy(syclQueue_t device_queue, int64_t n, float _Complex oneapi::mkl::blas::column_major::axpy(device_queue->val, n, alpha, reinterpret_cast *>(x), incx, reinterpret_cast *>(y), incy); } +extern "C" void onemklZaxpy(syclQueue_t device_queue, int64_t n, double _Complex alpha, const double _Complex *x, std::int64_t incx, double _Complex *y, int64_t incy) { + oneapi::mkl::blas::column_major::axpy(device_queue->val, n, alpha, reinterpret_cast *>(x), incx, reinterpret_cast *>(y), incy); +} + // other // oneMKL keeps a cache of SYCL queues and tries to destroy them when unloading the library. diff --git a/deps/onemkl.h b/deps/onemkl.h index ce43452e..9c9f1bf8 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -42,7 +42,7 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, void onemklSaxpy(syclQueue_t device_queue, int64_t n, float alpha, const float *x, int64_t incx, float *y, int64_t incy); void onemklDaxpy(syclQueue_t device_queue, int64_t n, double alpha, const double *x, int64_t incx, double *y, int64_t incy); void onemklCaxpy(syclQueue_t device_queue, int64_t n, float _Complex alpha, const float _Complex *x, int64_t incx, float _Complex *y, int64_t incy); - +void onemklZaxpy(syclQueue_t device_queue, int64_t n, double _Complex alpha, const double _Complex *x, int64_t incx, double _Complex *y, int64_t incy); void onemklDestroy(); #ifdef __cplusplus diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 9b481f20..0ef674f0 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -53,3 +53,7 @@ end function onemklCaxpy(device_queue, n, alpha, x, incx, y, incy) @ccall liboneapi_support.onemklCaxpy(device_queue::syclQueue_t, n::Int64, alpha::ComplexF32, x::ZePtr{ComplexF32}, incx::Int64, y::ZePtr{ComplexF32}, incy::Int64)::Cvoid end + +function onemklZaxpy(device_queue, n, alpha, x, incx, y, incy) + @ccall liboneapi_support.onemklZaxpy(device_queue::syclQueue_t, n::Int64, alpha::ComplexF64, x::ZePtr{ComplexF64}, incx::Int64, y::ZePtr{ComplexF64}, incy::Int64)::Cvoid +end From af0624d9489e41ad1aa5deda7918d5e782c09844 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 01:04:13 +0530 Subject: [PATCH 04/12] NITS --- test/onemkl.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 33839fb3..8d6ece79 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -12,10 +12,7 @@ k = 13 if T <:oneMKL.onemklFloat A = rand(T,m) B = rand(T, m) - #gpuA = oneArray(A) - #gpuB = oneArray{T}(undef, m) alpha = rand() - #oneMKL.axpy!(m, alpha, gpuA, gpuB) @test testf(axpy!, alpha, A, B) end end From e9457008c3f56425b0a5ab775cc478c2ee354bed Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 09:57:00 +0530 Subject: [PATCH 05/12] 1. oneStrided array used 2. remove half types in check --- lib/mkl/linalg.jl | 2 +- lib/mkl/wrappers.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index 048a180b..a3ae80c5 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -49,7 +49,7 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end end -function LinearAlgebra.axpy!(alpha::Number, x::oneStridedVecOrMat{<:onemklFloat}, y::oneStridedVecOrMat{<:onemklFloat}) where T<:Union{Float16, ComplexF16, onemklFloat} +function LinearAlgebra.axpy!(alpha::Number, x::oneStridedVecOrMat{<:onemklFloat}, y::oneStridedVecOrMat{<:onemklFloat}) where T<:Union{onemklFloat} length(x)==length(y) || throw(DimensionMismatch("axpy arguments have lengths $(length(x)) and $(length(y))")) oneMKL.axpy!(length(x), alpha, x, y) end diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index f9244e51..32856aab 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -24,8 +24,8 @@ for (fname, elty) in @eval begin function axpy!(n::Integer, alpha::Number, - x::StridedArray{$elty}, - y::StridedArray{$elty} + x::oneStridedArray{$elty}, + y::oneStridedArray{$elty} ) queue = global_queue(context(x), device(x)) alpha = $elty(alpha) From c5fbd0d42d991dcddf3424c654693ef9bf79422f Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 15:28:58 +0530 Subject: [PATCH 06/12] fixes to conflicts --- deps/onemkl.cpp | 1 - test/onemkl.jl | 17 +++-------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index 9277639b..3a952cd0 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -118,7 +118,6 @@ extern "C" void onemklCcopy(syclQueue_t device_queue, int64_t n, const float _Co oneapi::mkl::blas::column_major::copy(device_queue->val, n, reinterpret_cast *>(x), incx, reinterpret_cast *>(y), incy); ->>>>>>> master } // other diff --git a/test/onemkl.jl b/test/onemkl.jl index ef16b2e1..0f1e2594 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -3,25 +3,12 @@ using oneAPI.oneMKL <<<<<<< HEAD ======= ->>>>>>> master using LinearAlgebra m = 20 n = 35 k = 13 -<<<<<<< HEAD -##### -@testset "level 1" begin - @testset for T in eltypes - if T <:oneMKL.onemklFloat - A = rand(T,m) - B = rand(T, m) - alpha = rand() - @test testf(axpy!, alpha, A, B) - end - end -======= ############################################################################################ @testset "level 1" begin @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) @@ -29,6 +16,8 @@ k = 13 B = oneArray{T}(undef, m) oneMKL.copy!(m,A,B) @test Array(A) == Array(B) + + alpha = rand() + @test testf(axpy!, alpha, rand(T,m), rand(T,m)) end # level 1 testset ->>>>>>> master end From 680353f775ff84eec43ab9d58dc267bdf1dd26a7 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 15:32:34 +0530 Subject: [PATCH 07/12] revert onemkl lib f16 --- lib/mkl/oneMKL.jl | 2 +- test/onemkl.jl | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl index 41dfbea0..d83f2141 100644 --- a/lib/mkl/oneMKL.jl +++ b/lib/mkl/oneMKL.jl @@ -12,7 +12,7 @@ using GPUArrays include("libonemkl.jl") -const onemklFloat = Union{Float64,Float32,ComplexF64,ComplexF32} +const onemklFloat = Union{Float64,Float32,Float16,ComplexF64,ComplexF32} include("wrappers.jl") include("linalg.jl") diff --git a/test/onemkl.jl b/test/onemkl.jl index 0f1e2594..d0da3ede 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -1,7 +1,5 @@ using oneAPI using oneAPI.oneMKL -<<<<<<< HEAD -======= using LinearAlgebra From e286b63c3891b4faf218d70799b3a878c2d0e30a Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 15:39:01 +0530 Subject: [PATCH 08/12] NITS --- deps/onemkl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index 3a952cd0..49e0a321 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -95,6 +95,7 @@ extern "C" void onemklCaxpy(syclQueue_t device_queue, int64_t n, float _Complex extern "C" void onemklZaxpy(syclQueue_t device_queue, int64_t n, double _Complex alpha, const double _Complex *x, std::int64_t incx, double _Complex *y, int64_t incy) { oneapi::mkl::blas::column_major::axpy(device_queue->val, n, alpha, reinterpret_cast *>(x), incx, reinterpret_cast *>(y), incy); +} extern "C" void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *y, int64_t incy) { From 28b2420a84e4e7936c84509a4705cb3bf4e3e1b1 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 15:44:36 +0530 Subject: [PATCH 09/12] NITS --- lib/mkl/wrappers.jl | 2 +- test/onemkl.jl | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 65ae6911..010517a7 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -15,7 +15,7 @@ function Base.convert(::Type{onemklTranspose}, trans::Char) end # level 1 -## axpy +## axpy primitive for (fname, elty) in ((:onemklDaxpy,:Float64), (:onemklSaxpy,:Float32), diff --git a/test/onemkl.jl b/test/onemkl.jl index d0da3ede..61fa930a 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -15,6 +15,7 @@ k = 13 oneMKL.copy!(m,A,B) @test Array(A) == Array(B) + # Test axpy primitive alpha = rand() @test testf(axpy!, alpha, rand(T,m), rand(T,m)) end # level 1 testset From f5a6121a5797dad15b458a2991956df01015e555 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 16:24:54 +0530 Subject: [PATCH 10/12] T enabled --- test/onemkl.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 61fa930a..f56fe815 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -16,7 +16,7 @@ k = 13 @test Array(A) == Array(B) # Test axpy primitive - alpha = rand() - @test testf(axpy!, alpha, rand(T,m), rand(T,m)) + alpha = rand(T,1) + @test testf(axpy!, alpha[1], rand(T,m), rand(T,m)) end # level 1 testset end From 3a13c9fb8d4331542c8046693cf9a34037f7c4f6 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 13:28:11 +0530 Subject: [PATCH 11/12] use testset for copy/axpy --- test/onemkl.jl | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index f56fe815..723e190a 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -10,13 +10,17 @@ k = 13 ############################################################################################ @testset "level 1" begin @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) - A = oneArray(rand(T, m)) - B = oneArray{T}(undef, m) - oneMKL.copy!(m,A,B) - @test Array(A) == Array(B) - - # Test axpy primitive - alpha = rand(T,1) - @test testf(axpy!, alpha[1], rand(T,m), rand(T,m)) + @testset "copy" begin + A = oneArray(rand(T, m)) + B = oneArray{T}(undef, m) + oneMKL.copy!(m,A,B) + @test Array(A) == Array(B) + end + + @testset "axpy" begin + # Test axpy primitive + alpha = rand(T,1) + @test testf(axpy!, alpha[1], rand(T,m), rand(T,m)) + end end # level 1 testset end From bdd43a37369a1b9a9fcc70fb8f9e7b0e2c9f464c Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 13:29:16 +0530 Subject: [PATCH 12/12] NITS --- test/onemkl.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 723e190a..e2ab833d 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -15,7 +15,7 @@ k = 13 B = oneArray{T}(undef, m) oneMKL.copy!(m,A,B) @test Array(A) == Array(B) - end + end @testset "axpy" begin # Test axpy primitive