diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp index 6c7751b8b0..b64a83f545 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels.cpp @@ -120,31 +120,26 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1) { size_t dim1_size = sizeof(float) * dim1; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - opencl::Buffer dotResult(cl_context_ref.context_inst_, sizeof(float), true, - &cl_ret); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, vecAdata); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecAdata); if (!result) { break; } - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecXdata); if (!result) { break; } - result = kernel_dot_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_dot_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = kernel_dot_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + result = kernel_dot_ptr->SetKernelArguments( + 1, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { break; } @@ -154,7 +149,8 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1) { break; } - result = kernel_dot_ptr->SetKernelArguments(3, &dotResult, sizeof(cl_mem)); + result = kernel_dot_ptr->SetKernelArguments( + 3, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -168,7 +164,8 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1) { break; } - result = dotResult.ReadData(cl_context_ref.command_queue_inst_, &cl_ret); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, sizeof(float), &cl_ret); if (!result) { break; } @@ -213,41 +210,38 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B, size_t k_n_size = K * N * sizeof(float); size_t m_n_size = M * N * sizeof(float); - opencl::Buffer inputA(cl_context_ref.context_inst_, m_k_size, true, - nullptr); - - opencl::Buffer inputB(cl_context_ref.context_inst_, k_n_size, true, - nullptr); - - opencl::Buffer inOutC(cl_context_ref.context_inst_, m_n_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, A); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, m_k_size, A); if (!result) { break; } - result = inputB.WriteData(cl_context_ref.command_queue_inst_, B); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, k_n_size, B); if (!result) { break; } - result = inOutC.WriteData(cl_context_ref.command_queue_inst_, C); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, m_n_size, C); if (!result) { break; } - result = kernel_sgemm_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_sgemm_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = kernel_sgemm_ptr->SetKernelArguments(1, &inputB, sizeof(cl_mem)); + result = kernel_sgemm_ptr->SetKernelArguments( + 1, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { break; } - result = kernel_sgemm_ptr->SetKernelArguments(2, &inOutC, sizeof(cl_mem)); + result = kernel_sgemm_ptr->SetKernelArguments( + 2, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -281,7 +275,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B, break; } - result = inOutC.ReadData(cl_context_ref.command_queue_inst_, C); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, m_n_size, C); if (!result) { break; } @@ -372,14 +367,14 @@ void sscal_cl(float *X, const unsigned int N, const float alpha) { size_t x_size = N * sizeof(float); - opencl::Buffer inputX(cl_context_ref.context_inst_, x_size, false, nullptr); - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, X); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, x_size, X); if (!result) { break; } - result = kernel_ptr->SetKernelArguments(0, &inputX, sizeof(cl_mem)); + result = kernel_ptr->SetKernelArguments(0, clbuffInstance.getOutBufferA(), + sizeof(cl_mem)); if (!result) { break; } @@ -398,7 +393,8 @@ void sscal_cl(float *X, const unsigned int N, const float alpha) { break; } - result = inputX.ReadData(cl_context_ref.command_queue_inst_, X); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, x_size, X); if (!result) { break; } @@ -439,30 +435,26 @@ void transpose_cl_axis(const float *in, float *res, size_t dim_size = sizeof(float) * input_batch_size * input_height * input_width * input_channels; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, in); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim_size, in); if (!result) { break; } - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim_size, res); if (!result) { break; } - result = - kernel_transpose_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_transpose_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = - kernel_transpose_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem)); + result = kernel_transpose_ptr->SetKernelArguments( + 1, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -503,7 +495,8 @@ void transpose_cl_axis(const float *in, float *res, break; } - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim_size, res); if (!result) { break; } diff --git a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp index bdff42c135..5d5373eff5 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp @@ -39,44 +39,40 @@ void sgemv_cl(const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata, size_t dim1_size = sizeof(cl_half) * dim1; size_t dim2_size = sizeof(cl_half) * dim2; - opencl::Buffer inputA(cl_context_ref.context_inst_, - dim1 * dim2 * sizeof(cl_half), true, nullptr); - opencl::Buffer inputX(cl_context_ref.context_inst_, dim2_size, true, - nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(cl_half), + matAdata); if (!result) { break; } - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, vecXdata); if (!result) { break; } - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break; } - result = - kernel_sgemv_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_sgemv_fp16_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = - kernel_sgemv_fp16_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + result = kernel_sgemv_fp16_ptr->SetKernelArguments( + 1, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { break; } - result = - kernel_sgemv_fp16_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + result = kernel_sgemv_fp16_ptr->SetKernelArguments( + 2, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -100,7 +96,8 @@ void sgemv_cl(const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata, break; } - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break; } @@ -125,33 +122,26 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, size_t dim1_size = sizeof(cl_half) * dim1; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - opencl::Buffer inputX(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - opencl::Buffer dotResult(cl_context_ref.context_inst_, sizeof(__fp16), true, - &cl_ret); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, vecAdata); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecAdata); if (!result) { break; } - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecXdata); if (!result) { break; } - result = - kernel_dot_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_dot_fp16_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = - kernel_dot_fp16_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + result = kernel_dot_fp16_ptr->SetKernelArguments( + 1, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { break; } @@ -161,8 +151,8 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, break; } - result = - kernel_dot_fp16_ptr->SetKernelArguments(3, &dotResult, sizeof(cl_mem)); + result = kernel_dot_fp16_ptr->SetKernelArguments( + 3, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -176,7 +166,8 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, break; } - result = dotResult.ReadData(cl_context_ref.command_queue_inst_, &cl_ret); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, sizeof(cl_half), &cl_ret); if (!result) { break; } @@ -221,44 +212,38 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B, size_t k_n_size = K * N * sizeof(cl_half); size_t m_n_size = M * N * sizeof(cl_half); - opencl::Buffer inputA(cl_context_ref.context_inst_, m_k_size, true, - nullptr); - - opencl::Buffer inputB(cl_context_ref.context_inst_, k_n_size, true, - nullptr); - - opencl::Buffer inOutC(cl_context_ref.context_inst_, m_n_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, A); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, m_k_size, A); if (!result) { break; } - result = inputB.WriteData(cl_context_ref.command_queue_inst_, B); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, k_n_size, B); if (!result) { break; } - result = inOutC.WriteData(cl_context_ref.command_queue_inst_, C); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, m_n_size, C); if (!result) { break; } - result = - kernel_sgemm_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_sgemm_fp16_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = - kernel_sgemm_fp16_ptr->SetKernelArguments(1, &inputB, sizeof(cl_mem)); + result = kernel_sgemm_fp16_ptr->SetKernelArguments( + 1, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { break; } - result = - kernel_sgemm_fp16_ptr->SetKernelArguments(2, &inOutC, sizeof(cl_mem)); + result = kernel_sgemm_fp16_ptr->SetKernelArguments( + 2, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -292,7 +277,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B, break; } - result = inOutC.ReadData(cl_context_ref.command_queue_inst_, C); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, m_n_size, C); if (!result) { break; } @@ -384,15 +370,14 @@ void sscal_cl(__fp16 *X, const unsigned int N, const float alpha) { size_t x_size = N * sizeof(cl_half); - opencl::Buffer inputX(cl_context_ref.context_inst_, x_size, false, nullptr); - - result = inputX.WriteData(cl_context_ref.command_queue_inst_, X); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, x_size, X); if (!result) { break; } - result = - kernel_sscal_fp16_ptr->SetKernelArguments(0, &inputX, sizeof(cl_mem)); + result = kernel_sscal_fp16_ptr->SetKernelArguments( + 0, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -412,7 +397,8 @@ void sscal_cl(__fp16 *X, const unsigned int N, const float alpha) { break; } - result = inputX.ReadData(cl_context_ref.command_queue_inst_, X); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, x_size, X); if (!result) { break; } @@ -453,30 +439,26 @@ void transpose_cl_axis(const __fp16 *in, __fp16 *res, size_t dim_size = sizeof(__fp16) * input_batch_size * input_height * input_width * input_channels; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, in); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim_size, in); if (!result) { break; } - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim_size, res); if (!result) { break; } - result = kernel_transpose_fp_16_ptr->SetKernelArguments(0, &inputA, - sizeof(cl_mem)); + result = kernel_transpose_fp_16_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = kernel_transpose_fp_16_ptr->SetKernelArguments(1, &inOutRes, - sizeof(cl_mem)); + result = kernel_transpose_fp_16_ptr->SetKernelArguments( + 1, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -517,7 +499,8 @@ void transpose_cl_axis(const __fp16 *in, __fp16 *res, break; } - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim_size, res); if (!result) { break; }