From 76348d11857af7eeec6a374d8d2997162b34c0a9 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Tue, 19 Jun 2018 09:26:27 -0500 Subject: [PATCH 1/4] Copy hookups for clang-tidy from miopen. Not working yet. --- CMakeLists.txt | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e78e161a..837fffa38 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,6 +117,63 @@ include( ROCMCreatePackage ) include( ROCMInstallTargets ) include( ROCMPackageConfigHelpers ) include( ROCMInstallSymlinks ) +include( ROCMClangTidy ) +rocm_enable_clang_tidy( + CHECKS + * + -cert-env33-c + -android-cloexec-fopen + -cert-msc50-cpp + -clang-analyzer-alpha.core.CastToStruct + -clang-analyzer-optin.performance.Padding + -clang-diagnostic-deprecated-declarations + -clang-diagnostic-extern-c-compat + -clang-diagnostic-unused-command-line-argument + -cppcoreguidelines-pro-bounds-array-to-pointer-decay + -cppcoreguidelines-pro-bounds-constant-array-index + -cppcoreguidelines-pro-bounds-pointer-arithmetic + -cppcoreguidelines-pro-type-member-init + -cppcoreguidelines-pro-type-reinterpret-cast + -cppcoreguidelines-pro-type-union-access + -cppcoreguidelines-pro-type-vararg + -cppcoreguidelines-special-member-functions + -fuchsia-* + -google-readability-braces-around-statements + -google-readability-todo + -google-runtime-int + -google-runtime-references + -hicpp-braces-around-statements + -hicpp-explicit-conversions + -hicpp-no-array-decay + -hicpp-special-member-functions + -hicpp-use-override + # This check is broken + -hicpp-use-auto + -llvm-header-guard + -llvm-include-order + -misc-macro-parentheses + -modernize-use-auto + -modernize-use-override + -modernize-pass-by-value + -modernize-use-default-member-init + -modernize-use-transparent-functors + -performance-unnecessary-value-param + -readability-braces-around-statements + -readability-else-after-return + -readability-named-parameter + -*-explicit-constructor + -*-use-emplace + -*-use-equals-default + ERRORS + * + -readability-inconsistent-declaration-parameter-name + HEADER_FILTER + ".*hpp" + EXTRA_ARGS + -DROCSOLVER_USE_CLANG_TIDY + ANALYZE_TEMPORARY_DTORS ON + +) rocm_setup_version( VERSION 0.1.0 NO_GIT_TAG_VERSION ) From 5209abb54d80977ceb5cea3d80f924b65c0c6ed9 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Tue, 19 Jun 2018 09:27:40 -0500 Subject: [PATCH 2/4] Mark rocsolver-bench as a clang-tidy target. --- README.md | 7 +- clients/gtest/getrs_gtest.cpp | 162 +++++++++++++++++++ clients/include/testing_getrs.hpp | 216 +++++++++++++++++++++++++ library/src/lapack/roclapack_getf2.cpp | 12 +- library/src/lapack/roclapack_getrf.cpp | 14 +- library/src/lapack/roclapack_getrs.cpp | 31 ++++ library/src/lapack/roclapack_getrs.hpp | 90 +++++++++++ library/src/lapack/roclapack_potf2.cpp | 12 +- 8 files changed, 521 insertions(+), 23 deletions(-) create mode 100644 clients/gtest/getrs_gtest.cpp create mode 100644 clients/include/testing_getrs.hpp create mode 100644 library/src/lapack/roclapack_getrs.cpp create mode 100644 library/src/lapack/roclapack_getrs.hpp diff --git a/README.md b/README.md index 9ea51d172..c2eec0a7e 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ CXX=/opt/rocm/bin/hcc cmake .. make ``` # Implemented functions in LAPACK notation -Cholesky decomposition: `rocsolver_spotf2() rocsolver_dpotf2()` -unblocked LU decomposition: `rocsolver_sgetf2() rocsolver_dgetf2()` -blocked LU decomposition: `rocsolver_sgetrf() rocsolver_dgetrf()` +Cholesky decomposition: `rocsolver_spotf2() rocsolver_dpotf2()` +unblocked LU decomposition: `rocsolver_sgetf2() rocsolver_dgetf2()` +blocked LU decomposition: `rocsolver_sgetrf() rocsolver_dgetrf()` +solution of system of linear equations: `rocsolver_sgetrs() rocsolver_dgetrs()` diff --git a/clients/gtest/getrs_gtest.cpp b/clients/gtest/getrs_gtest.cpp new file mode 100644 index 000000000..8289ab102 --- /dev/null +++ b/clients/gtest/getrs_gtest.cpp @@ -0,0 +1,162 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +#include "testing_getrf.hpp" +#include "utility.h" +#include +#include +#include +#include + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using namespace std; + +// only GCC/VS 2010 comes with std::tr1::tuple, but it is unnecessary, +// std::tuple is good enough; + +typedef std::tuple, int> getrf_tuple; + +/* ===================================================================== +README: This file contains testers to verify the correctness of + BLAS routines with google test + + It is supposed to be played/used by advance / expert users + Normal users only need to get the library routines without testers + =================================================================== */ + +/* ===================================================================== +Advance users only: BrainStorm the parameters but do not make artificial one +which invalidates the matrix. like lda pairs with M, and "lda must >= M". case +"lda < M" will be guarded by argument-checkers inside API of course. Yet, the +goal of this file is to verify result correctness not argument-checkers. + +Representative sampling is sufficient, endless brute-force sampling is not +necessary +=================================================================== */ + +// vector of vector, each vector is a {M, lda}; +// add/delete as a group +const vector> matrix_size_range = { + {-1, 1}, {10, 10}, {10, 20}, {500, 500}, {500, 750}, +}; + +// each is a N +const vector n_size_range = { + 1, 20, 40, 600, 600, +}; + +const vector> large_matrix_size_range = { + {192, 192}, {640, 640}, {1000, 1000}, {1024, 1024}, {2000, 2000}, +}; + +const vector large_n_size_range = { + 192, 640, 1000, 1024, 2000, +}; + +/* ===============Google Unit + * Test==================================================== */ + +/* ===================================================================== + LAPACK getrf: +=================================================================== */ + +/* ============================Setup + * Arguments======================================= */ + +// Please use "class Arguments" (see utility.hpp) to pass parameters to +// templated testers; Some routines may not touch/use certain "members" of +// objects "argus". like BLAS-1 Scal does not have lda, BLAS-2 GEMV does not +// have ldb, ldc; That is fine. These testers & routines will leave untouched +// members alone. Do not use std::tuple to directly pass parameters to testers +// by std:tuple, you have unpack it with extreme care for each one by like +// "std::get<0>" which is not intuitive and error-prone + +Arguments setup_getrf_arguments(getrf_tuple tup) { + + vector matrix_size = std::get<0>(tup); + int n_size = std::get<1>(tup); + + Arguments arg; + + // see the comments about matrix_size_range above + arg.M = matrix_size[0]; + arg.N = n_size; + arg.lda = matrix_size[1]; + + arg.timing = 0; + + return arg; +} + +class getrf_gtest : public ::TestWithParam { +protected: + getrf_gtest() {} + virtual ~getrf_gtest() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +TEST_P(getrf_gtest, getrf_gtest_float) { + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. + + Arguments arg = setup_getrf_arguments(GetParam()); + + rocblas_status status = testing_getrf(arg); + + // if not success, then the input argument is problematic, so detect the error + // message + if (status != rocblas_status_success) { + + if (arg.M < 0 || arg.N < 0) { + EXPECT_EQ(rocblas_status_invalid_size, status); + } else if (arg.lda < arg.M) { + EXPECT_EQ(rocblas_status_invalid_size, status); + } + } +} + +TEST_P(getrf_gtest, getrf_gtest_double) { + // GetParam return a tuple. Tee setup routine unpack the tuple + // and initializes arg(Arguments) which will be passed to testing routine + // The Arguments data struture have physical meaning associated. + // while the tuple is non-intuitive. + + Arguments arg = setup_getrf_arguments(GetParam()); + + rocblas_status status = testing_getrf(arg); + + // if not success, then the input argument is problematic, so detect the error + // message + if (status != rocblas_status_success) { + + if (arg.M < 0 || arg.N < 0) { + EXPECT_EQ(rocblas_status_invalid_size, status); + } else if (arg.lda < arg.M) { + EXPECT_EQ(rocblas_status_invalid_size, status); + } + } +} + +// notice we are using vector of vector +// so each element in xxx_range is a vector, +// ValuesIn take each element (a vector) and combine them and feed them to +// test_p The combinations are { {M, N, lda}} + +// This function mainly test the scope of matrix_size. +INSTANTIATE_TEST_CASE_P(daily_lapack, getrf_gtest, + Combine(ValuesIn(large_matrix_size_range), + ValuesIn(large_n_size_range))); + +// THis function mainly test the scope of uplo_range, the scope of +// matrix_size_range is small +INSTANTIATE_TEST_CASE_P(checkin_lapack, getrf_gtest, + Combine(ValuesIn(matrix_size_range), + ValuesIn(n_size_range))); diff --git a/clients/include/testing_getrs.hpp b/clients/include/testing_getrs.hpp new file mode 100644 index 000000000..0954930a7 --- /dev/null +++ b/clients/include/testing_getrs.hpp @@ -0,0 +1,216 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include // std::abs +#include +#include +#include // std::numeric_limits::epsilon(); +#include +#include +#include + +#include "arg_check.h" +#include "cblas_interface.h" +#include "norm.h" +#include "rocblas_test_unique_ptr.hpp" +#include "rocsolver.hpp" +#include "unit.h" +#include "utility.h" +#ifdef GOOGLE_TEST +#include +#endif + +// this is max error PER element after the LU +#define GETRF_ERROR_EPS_MULTIPLIER 500 + +using namespace std; + +template rocblas_status testing_getrf(Arguments argus) { + + rocblas_int M = argus.M; + rocblas_int N = argus.N; + rocblas_int lda = argus.lda; + + rocblas_int safe_size = 100; // arbitrarily set to 100 + + rocblas_int size_A = max(lda, M) * N; + + rocblas_status status; + + std::unique_ptr unique_ptr_handle( + new rocblas_test::handle_struct); + rocblas_handle handle = unique_ptr_handle->handle; + + // check here to prevent undefined memory allocation error + if (M < 0 || N < 0 || lda < M) { + auto dA_managed = + rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * safe_size), + rocblas_test::device_free}; + T *dA = (T *)dA_managed.get(); + if (!dA) { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return rocblas_status_memory_error; + } + + auto dIpiv_managed = + rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)), + rocblas_test::device_free}; + rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get(); + + status = rocsolver_getrf(handle, M, N, dA, lda, dIpiv); + + getrf_arg_check(status, M, N); + + return status; + } + + // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory + vector hA(size_A); + vector AAT(size_A); + + double gpu_time_used, cpu_time_used; + T error_eps_multiplier = GETRF_ERROR_EPS_MULTIPLIER; + T eps = std::numeric_limits::epsilon(); + + // allocate memory on device + auto dA_managed = + rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * size_A), + rocblas_test::device_free}; + T *dA = (T *)dA_managed.get(); + if (!dA) { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return rocblas_status_memory_error; + } + + // initialize full random matrix hA with all entries in [1, 10] + rocblas_init(hA, M, N, lda); + + // pad untouched area into zero + for (int i = M; i < lda; i++) { + for (int j = 0; j < N; j++) { + hA[i + j * lda] = 0.0; + } + } + + // put it into [0, 1] + for (int i = M; i < lda; i++) { + for (int j = 0; j < N; j++) { + hA[i + j * lda] = (hA[i + j * lda] - 1.0) / 10.0; + } + } + + // now make it diagonally dominant + for (int i = 0; i < min(M, N); i++) { + hA[i + i * lda] *= 420.0; + } + + // copy data from CPU to device + CHECK_HIP_ERROR( + hipMemcpy(dA, hA.data(), sizeof(T) * size_A, hipMemcpyHostToDevice)); + + // allocate space for the pivoting array + vector hIpiv(min(M, N)); + auto dIpiv_managed = + rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)), + rocblas_test::device_free}; + rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get(); + + T max_err_1 = 0.0; + if (argus.unit_check || argus.norm_check) { + // calculate dXorB <- A^(-1) B rocblas_pointer_mode_host + const rocblas_status retGPU = + rocsolver_getrf(handle, M, N, dA, lda, dIpiv); + + CHECK_HIP_ERROR( + hipMemcpy(AAT.data(), dA, sizeof(T) * size_A, hipMemcpyDeviceToHost)); + + const int retCBLAS = cblas_getrf(M, N, hA.data(), lda, hIpiv.data()); + + if (retCBLAS != 0) { + // error encountered - we expect the same to happen from the GPU! + if (retGPU == rocblas_status_success) { + fprintf(stderr, "rocBLAS should fail also but doesn't!"); + return rocblas_status_internal_error; + } + } else { + CHECK_ROCBLAS_ERROR(retGPU); + + // Error Check + + // check if the pivoting returned is identical + vector hIpivGPU(min(M, N)); + CHECK_HIP_ERROR(hipMemcpy(hIpivGPU.data(), dIpiv, sizeof(int) * min(M, N), + hipMemcpyDeviceToHost)); + for (int j = 0; j < min(M, N); j++) { + const int refPiv = hIpiv[j]; + const int gpuPiv = hIpivGPU[j]; + if (refPiv != gpuPiv) { + cerr << "reference pivot " << j << ": " << refPiv << " vs " << gpuPiv + << endl; + return rocblas_status_internal_error; + } + } + + // AAT contains calculated decomposition, so error is hA - AAT + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + AAT[i + j * lda] = abs(AAT[i + j * lda] - hA[i + j * lda]); + } + } + + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + max_err_1 = + max_err_1 > AAT[i + j * lda] ? max_err_1 : AAT[i + j * lda]; + } + } + getrf_err_res_check(max_err_1, M, N, error_eps_multiplier, eps); + } + } + + if (argus.timing) { + // GPU rocBLAS + gpu_time_used = get_time_us(); // in microseconds + + const rocblas_status retGPU = + rocsolver_getrf(handle, M, N, dA, lda, dIpiv); + + gpu_time_used = get_time_us() - gpu_time_used; + + // CPU cblas + cpu_time_used = get_time_us(); + + const int retCBLAS = cblas_getrf(M, N, hA.data(), lda, hIpiv.data()); + + if (retCBLAS != 0) { + // error encountered - we expect the same to happen from the GPU! + if (retGPU == rocblas_status_success) { + fprintf(stderr, "rocBLAS should fail also but doesn't!"); + } + } else { + CHECK_ROCBLAS_ERROR(retGPU); + } + + cpu_time_used = get_time_us() - cpu_time_used; + + // only norm_check return an norm error, unit check won't return anything + cout << "M , N , lda , us [gpu] , us [cpu]"; + + if (argus.norm_check) + cout << ",norm_error_host_ptr"; + + cout << endl; + + cout << M << " , " << N << " , " << lda << " , " << gpu_time_used << " , " + << cpu_time_used; + + if (argus.norm_check) + cout << " , " << max_err_1; + + cout << endl; + } + return rocblas_status_success; +} + +#undef GETRF_ERROR_EPS_MULTIPLIER diff --git a/library/src/lapack/roclapack_getf2.cpp b/library/src/lapack/roclapack_getf2.cpp index 4c415b032..3011c9cfe 100644 --- a/library/src/lapack/roclapack_getf2.cpp +++ b/library/src/lapack/roclapack_getf2.cpp @@ -14,14 +14,14 @@ * =========================================================================== */ -extern "C" rocblas_status rocsolver_sgetf2(rocblas_handle handle, rocblas_int m, - rocblas_int n, float *A, - rocblas_int lda, rocblas_int *ipiv) { +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_sgetf2(rocblas_handle handle, rocblas_int m, rocblas_int n, float *A, + rocblas_int lda, rocblas_int *ipiv) { return rocsolver_getf2_template(handle, m, n, A, lda, ipiv); } -extern "C" rocblas_status rocsolver_dgetf2(rocblas_handle handle, rocblas_int m, - rocblas_int n, double *A, - rocblas_int lda, rocblas_int *ipiv) { +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_dgetf2(rocblas_handle handle, rocblas_int m, rocblas_int n, double *A, + rocblas_int lda, rocblas_int *ipiv) { return rocsolver_getf2_template(handle, m, n, A, lda, ipiv); } \ No newline at end of file diff --git a/library/src/lapack/roclapack_getrf.cpp b/library/src/lapack/roclapack_getrf.cpp index 24c6f3f79..2c6e817a5 100644 --- a/library/src/lapack/roclapack_getrf.cpp +++ b/library/src/lapack/roclapack_getrf.cpp @@ -14,16 +14,14 @@ * =========================================================================== */ -extern "C" rocblas_status rocsolver_sgetrf(rocsolver_handle handle, - rocsolver_int m, rocsolver_int n, - float *A, rocsolver_int lda, - rocsolver_int *ipiv) { +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_sgetrf(rocsolver_handle handle, rocsolver_int m, rocsolver_int n, + float *A, rocsolver_int lda, rocsolver_int *ipiv) { return rocsolver_getrf_template(handle, m, n, A, lda, ipiv); } -extern "C" rocblas_status rocsolver_dgetrf(rocsolver_handle handle, - rocsolver_int m, rocsolver_int n, - double *A, rocsolver_int lda, - rocsolver_int *ipiv) { +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_dgetrf(rocsolver_handle handle, rocsolver_int m, rocsolver_int n, + double *A, rocsolver_int lda, rocsolver_int *ipiv) { return rocsolver_getrf_template(handle, m, n, A, lda, ipiv); } \ No newline at end of file diff --git a/library/src/lapack/roclapack_getrs.cpp b/library/src/lapack/roclapack_getrs.cpp new file mode 100644 index 000000000..921716627 --- /dev/null +++ b/library/src/lapack/roclapack_getrs.cpp @@ -0,0 +1,31 @@ +/* ************************************************************************ + * Derived from the BSD2-licensed + * LAPACK routine (version 3.1) -- + * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. + * November 2006 + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "roclapack_getrs.hpp" + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_sgetrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n, + rocblas_int nrhs, const float *A, rocblas_int lda, + const rocblas_int *ipiv, float *B, rocblas_int ldb) { + return rocsolver_getrs_template(handle, trans, n, nrhs, A, lda, ipiv, + B, ldb); +} + +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_dgetrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n, + rocblas_int nrhs, const double *A, rocblas_int lda, + const rocblas_int *ipiv, double *B, rocblas_int ldb) { + return rocsolver_getrs_template(handle, trans, n, nrhs, A, lda, ipiv, + B, ldb); +} diff --git a/library/src/lapack/roclapack_getrs.hpp b/library/src/lapack/roclapack_getrs.hpp new file mode 100644 index 000000000..ef1c6e070 --- /dev/null +++ b/library/src/lapack/roclapack_getrs.hpp @@ -0,0 +1,90 @@ +/* ************************************************************************ + * Derived from the BSD2-licensed + * LAPACK routine (version 3.1) -- + * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. + * November 2006 + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef ROCLAPACK_GETRS_HPP +#define ROCLAPACK_GETRS_HPP + +#include "rocsolver-export.h" +#include +#include + +#include "ideal_sizes.hpp" +#include "roclapack_laswp.hpp" + +#define GETRS_INPONE 0 + +template +rocblas_status +rocsolver_getrs_template(rocblas_handle handle, rocblas_operation trans, + rocblas_int n, rocblas_int nrhs, const T *A, + rocblas_int lda, const rocblas_int *ipiv, T *B, + rocblas_int ldb) { + + // TODO remove const_cast here once rocBLAS is released with the correct API + + // check for possible input problems + if (n < 0 || nrhs < 0 || lda < max(1, n) || ldb < max(1, n)) { + cout << "Invalid size " << n << " " << nrhs << " " << lda << " " << ldb + << endl; + return rocblas_status_invalid_size; + } + + // quick return + if (n == 0 || nrhs == 0) { + return rocblas_status_success; + } + + T inpsResHost[2]; + inpsResHost[GETRS_INPONE] = static_cast(1); + + // allocate a tiny bit of memory on device to avoid going onto CPU and needing + // to synchronize. + T *inpsResGPU; + hipMalloc(&inpsResGPU, sizeof(T)); + hipMemcpy(inpsResGPU, &inpsResHost[0], sizeof(T), hipMemcpyHostToDevice); + + if (trans == rocblas_operation_none) { + + // solve A * X = B + // first apply row interchanges to the right hand sides + roclapack_laswp_template(handle, nrhs, B, ldb, 1, n, ipiv, 1); + + // solve L*X - B, overwriting B with X + rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, + rocblas_operation_none, rocblas_diagonal_unit, n, nrhs, + &inpsResGPU[GETRS_INPONE], const_cast(A), lda, B, ldb); + + // solve U*X = B, overwriting B with X + rocblas_trsm(handle, rocblas_side_left, rocblas_fill_upper, + rocblas_operation_none, rocblas_diagonal_non_unit, n, nrhs, + &inpsResGPU[GETRS_INPONE], const_cast(A), lda, B, ldb); + } else { + + // solve A**T * X = B or A**H * X = B + // solve U**T *X = B or U**H *X = B, overwriting B with X + rocblas_trsm(handle, rocblas_side_left, rocblas_fill_upper, trans, + rocblas_diagonal_non_unit, n, nrhs, + &inpsResGPU[GETRS_INPONE], const_cast(A), lda, B, ldb); + + // solve L**T *X = B, or L**H *X = B overwriting B with X + rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, trans, + rocblas_diagonal_unit, n, nrhs, &inpsResGPU[GETRS_INPONE], + const_cast(A), lda, B, ldb); + + // apply row interchanges to the solution vectors + roclapack_laswp_template(handle, nrhs, B, ldb, 1, n, ipiv, -1); + } + + hipFree(inpsResGPU); + + return rocblas_status_success; +} + +#undef GETRS_INPONE + +#endif /* ROCLAPACK_GETRS_HPP */ diff --git a/library/src/lapack/roclapack_potf2.cpp b/library/src/lapack/roclapack_potf2.cpp index 1c1d0ce15..1aa308e3c 100644 --- a/library/src/lapack/roclapack_potf2.cpp +++ b/library/src/lapack/roclapack_potf2.cpp @@ -14,14 +14,14 @@ * =========================================================================== */ -extern "C" rocblas_status rocsolver_spotf2(rocblas_handle handle, - rocblas_fill uplo, rocblas_int n, - float *A, rocblas_int lda) { +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_spotf2(rocblas_handle handle, rocblas_fill uplo, rocblas_int n, + float *A, rocblas_int lda) { return rocsolver_potf2_template(handle, uplo, n, A, lda); } -extern "C" rocblas_status rocsolver_dpotf2(rocblas_handle handle, - rocblas_fill uplo, rocblas_int n, - double *A, rocblas_int lda) { +extern "C" ROCSOLVER_EXPORT rocblas_status +rocsolver_dpotf2(rocblas_handle handle, rocblas_fill uplo, rocblas_int n, + double *A, rocblas_int lda) { return rocsolver_potf2_template(handle, uplo, n, A, lda); } From cffb24a13cd1f87b41593fceca8973fd599ce6fc Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Tue, 19 Jun 2018 09:28:22 -0500 Subject: [PATCH 3/4] Mark rocsolver-bench as a clang-tidy target. --- clients/benchmarks/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt index 1836c2fe4..417075862 100755 --- a/clients/benchmarks/CMakeLists.txt +++ b/clients/benchmarks/CMakeLists.txt @@ -39,6 +39,8 @@ set( rocsolver_benchmark_common add_executable( rocsolver-bench client.cpp ${rocsolver_benchmark_common} ) target_compile_features( rocsolver-bench PRIVATE cxx_static_assert cxx_nullptr cxx_auto_type ) +rocm_clang_tidy_check(rocsolver-bench) + if( BUILD_WITH_TENSILE ) target_compile_definitions( rocsolver-bench PRIVATE BUILD_WITH_TENSILE=1 ) else() From 6b45238d86eed093f258e68d59a3039a295f0ae8 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Tue, 19 Jun 2018 09:31:58 -0500 Subject: [PATCH 4/4] Add a solver for systems of linear equations to rocSOLVER. This satisfies the sgetrs/dgetrs LAPACK API. Some trivial performance tests show a speedup of >300 compared to the reference LAPACK implementation on a Vega10 for large matrix sizes in single precision. --- clients/benchmarks/client.cpp | 8 +- clients/common/arg_check.cpp | 21 ++++ clients/common/cblas_interface.cpp | 50 +++++++++ clients/common/unit.cpp | 16 +++ clients/gtest/CMakeLists.txt | 3 + clients/gtest/getrs_gtest.cpp | 74 +++++++------ clients/include/arg_check.h | 3 + clients/include/cblas_interface.h | 5 + clients/include/rocsolver.hpp | 22 ++++ clients/include/testing_getrs.hpp | 139 ++++++++++++++++--------- clients/include/unit.h | 4 + library/include/rocsolver-functions.h | 105 +++++++++++++++++++ library/src/CMakeLists.txt | 6 +- library/src/lapack/roclapack_laswp.hpp | 24 +++-- 14 files changed, 381 insertions(+), 99 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 01f18da47..3f78e677e 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -8,6 +8,7 @@ #include "testing_getf2.hpp" #include "testing_getrf.hpp" +#include "testing_getrs.hpp" #include "testing_potf2.hpp" #include "utility.h" @@ -99,7 +100,7 @@ int main(int argc, char *argv[]) { ("function,f", po::value(&function)->default_value("potf2"), - "LAPACK function to test. Options: potf2, getf2, getrf") + "LAPACK function to test. Options: potf2, getf2, getrf, getrs") ("precision,r", po::value(&precision)->default_value('s'), "Options: h,s,d,c,z") @@ -191,6 +192,11 @@ int main(int argc, char *argv[]) { testing_getrf(argus); else if (precision == 'd') testing_getrf(argus); + } else if (function == "getrs") { + if (precision == 's') + testing_getrs(argus); + else if (precision == 'd') + testing_getrs(argus); } else { printf("Invalid value for --function \n"); return -1; diff --git a/clients/common/arg_check.cpp b/clients/common/arg_check.cpp index b489e5f1a..346c87833 100644 --- a/clients/common/arg_check.cpp +++ b/clients/common/arg_check.cpp @@ -79,6 +79,27 @@ void getrf_arg_check(rocblas_status status, rocblas_int M, rocblas_int N) { #endif } +void getrs_arg_check(rocblas_status status, rocblas_int M, rocblas_int nhrs, + rocblas_int lda, rocblas_int ldb) { +#ifdef GOOGLE_TEST + if (M < 0 || nhrs < 0 || lda < std::max(1, M) || ldb < std::max(1, M)) { + ASSERT_EQ(status, rocblas_status_invalid_size); + } else { + ASSERT_EQ(status, rocblas_status_success); + } +#else + if (M < 0 || nhrs < 0 || lda < std::max(1, M) || ldb < std::max(1, M)) { + if (status != rocblas_status_invalid_size) + std::cerr << "result should be invalid size for size " << M << " and " + << nhrs << std::endl; + } else { + if (status != rocblas_status_success) + std::cerr << "result should be success for size " << M << " and " << nhrs + << std::endl; + } +#endif +} + void verify_rocblas_status_invalid_pointer(rocblas_status status, const char *message) { #ifdef GOOGLE_TEST diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp index b7670fad1..1b32e116a 100644 --- a/clients/common/cblas_interface.cpp +++ b/clients/common/cblas_interface.cpp @@ -51,6 +51,16 @@ void cgetf2_(int *m, int *n, rocblas_float_complex *A, int *lda, int *ipiv, void zgetf2_(int *m, int *n, rocblas_double_complex *A, int *lda, int *ipiv, int *info); +void sgetrs_(char *trans, int *n, int *nrhs, float *A, int *lda, int *ipiv, + float *B, int *ldb, int *info); +void dgetrs_(char *trans, int *n, int *nrhs, double *A, int *lda, int *ipiv, + double *B, int *ldb, int *info); +void cgetrs_(char *trans, int *n, int *nrhs, rocblas_float_complex *A, int *lda, + int *ipiv, rocblas_float_complex *B, int *ldb, int *info); +void zgetrs_(char *trans, int *n, int *nrhs, rocblas_double_complex *A, + int *lda, int *ipiv, rocblas_double_complex *B, int *ldb, + int *info); + #ifdef __cplusplus } #endif @@ -743,3 +753,43 @@ rocblas_int cblas_potrf(char uplo, rocblas_int m, zpotrf_(&uplo, &m, A, &lda, &info); return info; } + +// getrs +template <> +rocblas_int cblas_getrs(char trans, rocblas_int n, rocblas_int nrhs, + float *A, rocblas_int lda, rocblas_int *ipiv, + float *B, rocblas_int ldb) { + rocblas_int info; + sgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); + return info; +} + +template <> +rocblas_int cblas_getrs(char trans, rocblas_int n, rocblas_int nrhs, + double *A, rocblas_int lda, rocblas_int *ipiv, + double *B, rocblas_int ldb) { + rocblas_int info; + dgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); + return info; +} + +template <> +rocblas_int +cblas_getrs(char trans, rocblas_int n, rocblas_int nrhs, + rocblas_float_complex *A, rocblas_int lda, + rocblas_int *ipiv, rocblas_float_complex *B, + rocblas_int ldb) { + rocblas_int info; + cgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); + return info; +} + +template <> +rocblas_int cblas_getrs( + char trans, rocblas_int n, rocblas_int nrhs, rocblas_double_complex *A, + rocblas_int lda, rocblas_int *ipiv, rocblas_double_complex *B, + rocblas_int ldb) { + rocblas_int info; + zgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); + return info; +} \ No newline at end of file diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index fec53fcc7..29cf0b1f8 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -182,4 +182,20 @@ void getrf_err_res_check(double max_error, rocblas_int M, rocblas_int N, #ifdef GOOGLE_TEST ASSERT_LE(max_error, forward_tolerance * eps); #endif +} + +template <> +void getrs_err_res_check(float max_error, rocblas_int M, rocblas_int nhrs, + float forward_tolerance, float eps) { +#ifdef GOOGLE_TEST + ASSERT_LE(max_error, forward_tolerance * eps); +#endif +} + +template <> +void getrs_err_res_check(double max_error, rocblas_int M, rocblas_int nhrs, + double forward_tolerance, double eps) { +#ifdef GOOGLE_TEST + ASSERT_LE(max_error, forward_tolerance * eps); +#endif } \ No newline at end of file diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index 105c598a0..def7c606c 100755 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -35,6 +35,7 @@ find_package( Threads REQUIRED ) set(roclapack_test_source getf2_gtest.cpp getrf_gtest.cpp + getrs_gtest.cpp potf2_gtest.cpp ) @@ -54,6 +55,8 @@ set( rocsolver_benchmark_common add_executable( rocsolver-test ${roclapack_test_source} ${rocsolver_test_source} ${rocsolver_benchmark_common} ) target_compile_features( rocsolver-test PRIVATE cxx_static_assert cxx_nullptr cxx_auto_type ) +rocm_clang_tidy_check(rocsolver-test) + target_compile_definitions( rocsolver-test PRIVATE BUILD_WITH_TENSILE=0 GOOGLE_TEST ) # Internal header includes diff --git a/clients/gtest/getrs_gtest.cpp b/clients/gtest/getrs_gtest.cpp index 8289ab102..80e2ad899 100644 --- a/clients/gtest/getrs_gtest.cpp +++ b/clients/gtest/getrs_gtest.cpp @@ -3,7 +3,7 @@ * * ************************************************************************ */ -#include "testing_getrf.hpp" +#include "testing_getrs.hpp" #include "utility.h" #include #include @@ -19,7 +19,7 @@ using namespace std; // only GCC/VS 2010 comes with std::tr1::tuple, but it is unnecessary, // std::tuple is good enough; -typedef std::tuple, int> getrf_tuple; +typedef std::tuple, vector, char> getrs_tuple; /* ===================================================================== README: This file contains testers to verify the correctness of @@ -41,21 +41,27 @@ necessary // vector of vector, each vector is a {M, lda}; // add/delete as a group -const vector> matrix_size_range = { +const vector> matrix_sizeA_range = { {-1, 1}, {10, 10}, {10, 20}, {500, 500}, {500, 750}, }; -// each is a N -const vector n_size_range = { - 1, 20, 40, 600, 600, +// vector of vector, each vector is a {M, lda}; +// add/delete as a group +const vector> matrix_sizeB_range = { + {-1, 1}, {10, 10}, {10, 20}, {500, 500}, {500, 750}, +}; + +const vector> large_matrix_sizeA_range = { + {192, 192}, {640, 640}, {1000, 1000}, {1024, 1024}, {2000, 2000}, }; -const vector> large_matrix_size_range = { +const vector> large_matrix_sizeB_range = { {192, 192}, {640, 640}, {1000, 1000}, {1024, 1024}, {2000, 2000}, }; -const vector large_n_size_range = { - 192, 640, 1000, 1024, 2000, +const vector transpose = { + 'N', + 'T', }; /* ===============Google Unit @@ -76,40 +82,42 @@ const vector large_n_size_range = { // by std:tuple, you have unpack it with extreme care for each one by like // "std::get<0>" which is not intuitive and error-prone -Arguments setup_getrf_arguments(getrf_tuple tup) { +Arguments setup_getrs_arguments(getrs_tuple tup) { - vector matrix_size = std::get<0>(tup); - int n_size = std::get<1>(tup); + vector matrix_sizeA = std::get<0>(tup); + vector matrix_sizeB = std::get<1>(tup); Arguments arg; // see the comments about matrix_size_range above - arg.M = matrix_size[0]; - arg.N = n_size; - arg.lda = matrix_size[1]; + arg.M = matrix_sizeA[0]; + arg.N = matrix_sizeB[0]; + arg.lda = matrix_sizeA[1]; + arg.ldb = matrix_sizeB[1]; + arg.transA_option = std::get<2>(tup); arg.timing = 0; return arg; } -class getrf_gtest : public ::TestWithParam { +class getrs_gtest : public ::TestWithParam { protected: - getrf_gtest() {} - virtual ~getrf_gtest() {} + getrs_gtest() {} + virtual ~getrs_gtest() {} virtual void SetUp() {} virtual void TearDown() {} }; -TEST_P(getrf_gtest, getrf_gtest_float) { +TEST_P(getrs_gtest, getrs_gtest_float) { // GetParam return a tuple. Tee setup routine unpack the tuple // and initializes arg(Arguments) which will be passed to testing routine // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. - Arguments arg = setup_getrf_arguments(GetParam()); + Arguments arg = setup_getrs_arguments(GetParam()); - rocblas_status status = testing_getrf(arg); + rocblas_status status = testing_getrs(arg); // if not success, then the input argument is problematic, so detect the error // message @@ -117,21 +125,21 @@ TEST_P(getrf_gtest, getrf_gtest_float) { if (arg.M < 0 || arg.N < 0) { EXPECT_EQ(rocblas_status_invalid_size, status); - } else if (arg.lda < arg.M) { + } else if (arg.lda < arg.M || arg.ldb < arg.M) { EXPECT_EQ(rocblas_status_invalid_size, status); } } } -TEST_P(getrf_gtest, getrf_gtest_double) { +TEST_P(getrs_gtest, getrs_gtest_double) { // GetParam return a tuple. Tee setup routine unpack the tuple // and initializes arg(Arguments) which will be passed to testing routine // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. - Arguments arg = setup_getrf_arguments(GetParam()); + Arguments arg = setup_getrs_arguments(GetParam()); - rocblas_status status = testing_getrf(arg); + rocblas_status status = testing_getrs(arg); // if not success, then the input argument is problematic, so detect the error // message @@ -139,7 +147,7 @@ TEST_P(getrf_gtest, getrf_gtest_double) { if (arg.M < 0 || arg.N < 0) { EXPECT_EQ(rocblas_status_invalid_size, status); - } else if (arg.lda < arg.M) { + } else if (arg.lda < arg.M || arg.ldb < arg.M) { EXPECT_EQ(rocblas_status_invalid_size, status); } } @@ -151,12 +159,14 @@ TEST_P(getrf_gtest, getrf_gtest_double) { // test_p The combinations are { {M, N, lda}} // This function mainly test the scope of matrix_size. -INSTANTIATE_TEST_CASE_P(daily_lapack, getrf_gtest, - Combine(ValuesIn(large_matrix_size_range), - ValuesIn(large_n_size_range))); +INSTANTIATE_TEST_CASE_P(daily_lapack, getrs_gtest, + Combine(ValuesIn(large_matrix_sizeA_range), + ValuesIn(large_matrix_sizeB_range), + ValuesIn(transpose))); // THis function mainly test the scope of uplo_range, the scope of // matrix_size_range is small -INSTANTIATE_TEST_CASE_P(checkin_lapack, getrf_gtest, - Combine(ValuesIn(matrix_size_range), - ValuesIn(n_size_range))); +INSTANTIATE_TEST_CASE_P(checkin_lapack, getrs_gtest, + Combine(ValuesIn(matrix_sizeA_range), + ValuesIn(matrix_sizeA_range), + ValuesIn(transpose))); diff --git a/clients/include/arg_check.h b/clients/include/arg_check.h index c7af7233e..7b8911aff 100644 --- a/clients/include/arg_check.h +++ b/clients/include/arg_check.h @@ -35,6 +35,9 @@ void getf2_arg_check(rocsolver_status status, rocsolver_int M, rocsolver_int N); void getrf_arg_check(rocsolver_status status, rocsolver_int M, rocsolver_int N); +void getrs_arg_check(rocsolver_status status, rocsolver_int M, + rocsolver_int nhrs, rocblas_int lda, rocblas_int ldb); + template void verify_not_nan(T arg); template void verify_equal(T arg1, T arg2, const char *message); diff --git a/clients/include/cblas_interface.h b/clients/include/cblas_interface.h index be8a9aaa6..1359a41ce 100644 --- a/clients/include/cblas_interface.h +++ b/clients/include/cblas_interface.h @@ -105,6 +105,11 @@ template rocblas_int cblas_getrf(rocblas_int m, rocblas_int n, T *A, rocblas_int lda, rocblas_int *ipiv); +template +rocblas_int cblas_getrs(char trans, rocblas_int n, rocblas_int nrhs, T *A, + rocblas_int lda, rocblas_int *ipiv, T *B, + rocblas_int ldb); + template rocblas_int cblas_potrf(char uplo, rocblas_int m, T *A, rocblas_int lda); /* ============================================================================================ diff --git a/clients/include/rocsolver.hpp b/clients/include/rocsolver.hpp index ecbcf0e66..34e22e910 100644 --- a/clients/include/rocsolver.hpp +++ b/clients/include/rocsolver.hpp @@ -59,4 +59,26 @@ inline rocblas_status rocsolver_getrf(rocblas_handle handle, rocblas_int m, return rocsolver_dgetrf(handle, m, n, A, lda, ipiv); } +template +inline rocblas_status +rocsolver_getrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n, + rocblas_int nrhs, const T *A, rocblas_int lda, + const rocblas_int *ipiv, T *B, rocblas_int ldb); + +template <> +inline rocblas_status +rocsolver_getrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n, + rocblas_int nrhs, const float *A, rocblas_int lda, + const rocblas_int *ipiv, float *B, rocblas_int ldb) { + return rocsolver_sgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); +} + +template <> +inline rocblas_status +rocsolver_getrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n, + rocblas_int nrhs, const double *A, rocblas_int lda, + const rocblas_int *ipiv, double *B, rocblas_int ldb) { + return rocsolver_dgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); +} + #endif /* ROCSOLVER_HPP */ diff --git a/clients/include/testing_getrs.hpp b/clients/include/testing_getrs.hpp index 0954930a7..efa0d0d53 100644 --- a/clients/include/testing_getrs.hpp +++ b/clients/include/testing_getrs.hpp @@ -21,20 +21,32 @@ #include #endif -// this is max error PER element after the LU +// this is max error PER element after the solution #define GETRF_ERROR_EPS_MULTIPLIER 500 using namespace std; -template rocblas_status testing_getrf(Arguments argus) { +template rocblas_status testing_getrs(Arguments argus) { rocblas_int M = argus.M; - rocblas_int N = argus.N; + rocblas_int nhrs = argus.N; rocblas_int lda = argus.lda; + rocblas_int ldb = argus.ldb; + char trans = argus.transA_option; + + rocblas_operation transRoc; + if (trans == 'N') { + transRoc = rocblas_operation_none; + } else if (trans == 'T') { + transRoc = rocblas_operation_transpose; + } else { + throw runtime_error("Unsupported transpose operation."); + } rocblas_int safe_size = 100; // arbitrarily set to 100 - rocblas_int size_A = max(lda, M) * N; + rocblas_int size_A = max(lda, M) * M; + rocblas_int size_B = max(ldb, M) * nhrs; rocblas_status status; @@ -43,7 +55,7 @@ template rocblas_status testing_getrf(Arguments argus) { rocblas_handle handle = unique_ptr_handle->handle; // check here to prevent undefined memory allocation error - if (M < 0 || N < 0 || lda < M) { + if (M < 0 || nhrs < 0 || lda < std::max(1, M) || ldb < std::max(1, M)) { auto dA_managed = rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * safe_size), rocblas_test::device_free}; @@ -53,21 +65,32 @@ template rocblas_status testing_getrf(Arguments argus) { return rocblas_status_memory_error; } + auto dB_managed = + rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * safe_size), + rocblas_test::device_free}; + T *dB = (T *)dB_managed.get(); + if (!dB) { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return rocblas_status_memory_error; + } + auto dIpiv_managed = - rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)), + rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * M), rocblas_test::device_free}; rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get(); - status = rocsolver_getrf(handle, M, N, dA, lda, dIpiv); + status = + rocsolver_getrs(handle, transRoc, M, nhrs, dA, lda, dIpiv, dB, ldb); - getrf_arg_check(status, M, N); + getrs_arg_check(status, M, nhrs, lda, ldb); return status; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory vector hA(size_A); - vector AAT(size_A); + vector hB(size_B); + vector hBRes(size_B); double gpu_time_used, cpu_time_used; T error_eps_multiplier = GETRF_ERROR_EPS_MULTIPLIER; @@ -83,49 +106,74 @@ template rocblas_status testing_getrf(Arguments argus) { return rocblas_status_memory_error; } - // initialize full random matrix hA with all entries in [1, 10] - rocblas_init(hA, M, N, lda); + auto dB_managed = + rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * size_B), + rocblas_test::device_free}; + T *dB = (T *)dB_managed.get(); + if (!dB) { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return rocblas_status_memory_error; + } + + // initialize full random matrix hA, hB with all entries in [1, 10] + rocblas_init(hA, M, M, lda); + rocblas_init(hB, M, nhrs, ldb); // pad untouched area into zero for (int i = M; i < lda; i++) { - for (int j = 0; j < N; j++) { + for (int j = 0; j < M; j++) { hA[i + j * lda] = 0.0; } } + for (int i = M; i < ldb; i++) { + for (int j = 0; j < nhrs; j++) { + hB[i + j * ldb] = 0.0; + } + } // put it into [0, 1] for (int i = M; i < lda; i++) { - for (int j = 0; j < N; j++) { + for (int j = 0; j < M; j++) { hA[i + j * lda] = (hA[i + j * lda] - 1.0) / 10.0; } } // now make it diagonally dominant - for (int i = 0; i < min(M, N); i++) { + for (int i = 0; i < M; i++) { hA[i + i * lda] *= 420.0; } - // copy data from CPU to device - CHECK_HIP_ERROR( - hipMemcpy(dA, hA.data(), sizeof(T) * size_A, hipMemcpyHostToDevice)); - // allocate space for the pivoting array - vector hIpiv(min(M, N)); - auto dIpiv_managed = - rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)), - rocblas_test::device_free}; + vector hIpiv(M); + auto dIpiv_managed = rocblas_unique_ptr{ + rocblas_test::device_malloc(sizeof(int) * M), rocblas_test::device_free}; rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get(); + // do the LU decomposition of matrix A w/ the reference LAPACK routine + const int retCBLAS = cblas_getrf(M, M, hA.data(), lda, hIpiv.data()); + if (retCBLAS != 0) { + // error encountered - unlucky pick of random numbers? no use to continue + return rocblas_status_success; + } + + // now copy pivoting indices and matrices to the GPU + CHECK_HIP_ERROR( + hipMemcpy(dA, hA.data(), sizeof(T) * size_A, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dB, hB.data(), sizeof(T) * size_B, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dIpiv, hIpiv.data(), sizeof(int) * M, hipMemcpyHostToDevice)); + T max_err_1 = 0.0; if (argus.unit_check || argus.norm_check) { - // calculate dXorB <- A^(-1) B rocblas_pointer_mode_host const rocblas_status retGPU = - rocsolver_getrf(handle, M, N, dA, lda, dIpiv); + rocsolver_getrs(handle, transRoc, M, nhrs, dA, lda, dIpiv, dB, ldb); CHECK_HIP_ERROR( - hipMemcpy(AAT.data(), dA, sizeof(T) * size_A, hipMemcpyDeviceToHost)); + hipMemcpy(hBRes.data(), dB, sizeof(T) * size_B, hipMemcpyDeviceToHost)); - const int retCBLAS = cblas_getrf(M, N, hA.data(), lda, hIpiv.data()); + const int retCBLAS = cblas_getrs(trans, M, nhrs, hA.data(), lda, + hIpiv.data(), hB.data(), ldb); if (retCBLAS != 0) { // error encountered - we expect the same to happen from the GPU! @@ -138,34 +186,20 @@ template rocblas_status testing_getrf(Arguments argus) { // Error Check - // check if the pivoting returned is identical - vector hIpivGPU(min(M, N)); - CHECK_HIP_ERROR(hipMemcpy(hIpivGPU.data(), dIpiv, sizeof(int) * min(M, N), - hipMemcpyDeviceToHost)); - for (int j = 0; j < min(M, N); j++) { - const int refPiv = hIpiv[j]; - const int gpuPiv = hIpivGPU[j]; - if (refPiv != gpuPiv) { - cerr << "reference pivot " << j << ": " << refPiv << " vs " << gpuPiv - << endl; - return rocblas_status_internal_error; - } - } - - // AAT contains calculated decomposition, so error is hA - AAT + // hBRes contains calculated decomposition, so error is hBres - hB for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { - AAT[i + j * lda] = abs(AAT[i + j * lda] - hA[i + j * lda]); + for (int j = 0; j < nhrs; j++) { + hBRes[i + j * ldb] = abs(hBRes[i + j * ldb] - hB[i + j * ldb]); } } for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { + for (int j = 0; j < nhrs; j++) { max_err_1 = - max_err_1 > AAT[i + j * lda] ? max_err_1 : AAT[i + j * lda]; + max_err_1 > hBRes[i + j * ldb] ? max_err_1 : hBRes[i + j * ldb]; } } - getrf_err_res_check(max_err_1, M, N, error_eps_multiplier, eps); + getrs_err_res_check(max_err_1, M, nhrs, error_eps_multiplier, eps); } } @@ -174,14 +208,15 @@ template rocblas_status testing_getrf(Arguments argus) { gpu_time_used = get_time_us(); // in microseconds const rocblas_status retGPU = - rocsolver_getrf(handle, M, N, dA, lda, dIpiv); + rocsolver_getrs(handle, transRoc, M, nhrs, dA, lda, dIpiv, dB, ldb); gpu_time_used = get_time_us() - gpu_time_used; // CPU cblas cpu_time_used = get_time_us(); - const int retCBLAS = cblas_getrf(M, N, hA.data(), lda, hIpiv.data()); + const int retCBLAS = cblas_getrs(trans, M, nhrs, hA.data(), lda, + hIpiv.data(), hB.data(), ldb); if (retCBLAS != 0) { // error encountered - we expect the same to happen from the GPU! @@ -195,15 +230,15 @@ template rocblas_status testing_getrf(Arguments argus) { cpu_time_used = get_time_us() - cpu_time_used; // only norm_check return an norm error, unit check won't return anything - cout << "M , N , lda , us [gpu] , us [cpu]"; + cout << "M , nhrs , lda , ldb , us [gpu] , us [cpu]"; if (argus.norm_check) - cout << ",norm_error_host_ptr"; + cout << ", norm_error_host_ptr"; cout << endl; - cout << M << " , " << N << " , " << lda << " , " << gpu_time_used << " , " - << cpu_time_used; + cout << M << " , " << nhrs << " , " << lda << " , " << ldb << " , " + << gpu_time_used << " , " << cpu_time_used; if (argus.norm_check) cout << " , " << max_err_1; diff --git a/clients/include/unit.h b/clients/include/unit.h index dc584d79f..fa429d6e1 100644 --- a/clients/include/unit.h +++ b/clients/include/unit.h @@ -45,6 +45,10 @@ template void getf2_err_res_check(T max_error, rocblas_int M, rocblas_int N, T forward_tolerance, T eps); +template +void getrs_err_res_check(T max_error, rocblas_int M, rocblas_int nhrs, + T forward_tolerance, T eps); + template void getrf_err_res_check(T max_error, rocblas_int M, rocblas_int N, T forward_tolerance, T eps); diff --git a/library/include/rocsolver-functions.h b/library/include/rocsolver-functions.h index 185f21c42..3c904b044 100644 --- a/library/include/rocsolver-functions.h +++ b/library/include/rocsolver-functions.h @@ -263,6 +263,111 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, rocsolver_int lda, rocsolver_int *ipiv); +/*! \brief LAPACK API + + \details + getrs solves a system of linear equations + A * X = B, A**T * X = B, or A**H * X = B + with a general N-by-N matrix A using the LU factorization computed + by getrf. + + @param[in] + trans + Specifies the form of the system of equations: + = 'N': A * X = B (No transpose) + = 'T': A**T * X = B (Transpose) + = 'C': A**H * X = B (Conjugate transpose) + + @param[in] + n + The order of the matrix A. N >= 0. + + @param[in] + nrhs + The number of right hand sides, i.e., the number of columns + of the matrix B. nrhs >= 0. + + @param[in] + A + pointer storing matrix A on the GPU. + + @param[in] + lda + The leading dimension of the array A. lda >= max(1,n). + + @param[in] + ipiv + The pivot indices from getrf; for 1<=i<=n, row i of the + matrix was interchanged with row ipiv(i). Assumes one-based indices! + + @param[in,out] + B + pointer storing matrix B on the GPU., dimension (ldb,nrhs) + On entry, the right hand side matrix B. + On exit, the solution matrix X. + + @param[in] + ldb + The leading dimension of the array B. ldb >= max(1,n). + + ********************************************************************/ +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrs( + rocsolver_handle handle, rocsolver_operation trans, rocsolver_int n, + rocsolver_int nrhs, const float *A, rocsolver_int lda, + const rocsolver_int *ipiv, float *B, rocsolver_int ldb); + +/*! \brief LAPACK API + + \details + getrs solves a system of linear equations + A * X = B, A**T * X = B, or A**H * X = B + with a general N-by-N matrix A using the LU factorization computed + by getrf. + + @param[in] + trans + Specifies the form of the system of equations: + = 'N': A * X = B (No transpose) + = 'T': A**T * X = B (Transpose) + = 'C': A**H * X = B (Conjugate transpose) + + @param[in] + n + The order of the matrix A. N >= 0. + + @param[in] + nrhs + The number of right hand sides, i.e., the number of columns + of the matrix B. nrhs >= 0. + + @param[in] + A + pointer storing matrix A on the GPU. + + @param[in] + lda + The leading dimension of the array A. lda >= max(1,n). + + @param[in] + ipiv + The pivot indices from getrf; for 1<=i<=n, row i of the + matrix was interchanged with row ipiv(i). Assumes one-based indices! + + @param[in,out] + B + pointer storing matrix B on the GPU., dimension (ldb,nrhs) + On entry, the right hand side matrix B. + On exit, the solution matrix X. + + @param[in] + ldb + The leading dimension of the array B. ldb >= max(1,n). + + ********************************************************************/ +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs( + rocsolver_handle handle, rocsolver_operation trans, rocsolver_int n, + rocsolver_int nrhs, const double *A, rocsolver_int lda, + const rocsolver_int *ipiv, double *B, rocsolver_int ldb); #ifdef __cplusplus } #endif diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 4469734e3..b6c2c4a78 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -33,6 +33,7 @@ set( rocsolver_lapack_source lapack/rocblas.cpp lapack/roclapack_getf2.cpp lapack/roclapack_getrf.cpp + lapack/roclapack_getrs.cpp lapack/roclapack_potf2.cpp ) @@ -40,15 +41,14 @@ prepend_path( ".." rocsolver_headers_public relative_rocsolver_headers_public ) add_library( rocsolver ${rocsolver_lapack_source} - ${rocsolver_blas3_source} - ${rocsolver_blas2_source} - ${rocsolver_blas1_source} ${relative_rocsolver_headers_public} ${rocsolver_auxiliary_source} ) add_library( roc::rocsolver ALIAS rocsolver ) +rocm_clang_tidy_check(rocsolver) + target_link_libraries( rocsolver PRIVATE hip::hip_hcc hip::hip_device hcc::hccshared ) target_link_libraries( rocsolver PRIVATE /opt/rocm/rocblas/lib/librocblas.so ) #${ROCBLAS_LIBRARY}) diff --git a/library/src/lapack/roclapack_laswp.hpp b/library/src/lapack/roclapack_laswp.hpp index bcc551c5f..e7b0fedd8 100644 --- a/library/src/lapack/roclapack_laswp.hpp +++ b/library/src/lapack/roclapack_laswp.hpp @@ -15,10 +15,11 @@ using namespace std; template -__global__ void laswp_external(rocblas_int n, T *a, rocblas_int lda, - rocblas_int exch1, rocblas_int *exch2) { +__global__ void laswp_external(const rocblas_int n, T *a, const rocblas_int lda, + const rocblas_int exch1, + const rocblas_int *exch2) { - int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if (tid < n) { const T orig = a[exch1 + lda * tid]; a[exch1 + lda * tid] = a[(*exch2 - 1) + lda * tid]; @@ -74,7 +75,7 @@ __global__ void laswp_external(rocblas_int n, T *a, rocblas_int lda, template void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A, rocblas_int lda, rocblas_int k1, rocblas_int k2, - rocblas_int *ipiv, rocblas_int incx) { + const rocblas_int *ipiv, rocblas_int incx) { if (n == 0) { // quick return @@ -83,8 +84,8 @@ void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A, rocblas_int start, end; if (incx < 0) { - start = k2; - end = k1; + start = k2 - 1; + end = k1 - 1; } else { start = k1; end = k2; @@ -99,13 +100,14 @@ void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A, * of ipiv to the host and check there. this causes the current limitation * that incx == 1 */ - if (incx != 1) { + if (incx != 1 && incx != -1) { throw runtime_error("roclapack_laswp increment must be one."); } - vector ipivHost(end - start); - hipMemcpy(ipivHost.data(), &ipiv[start], sizeof(rocblas_int) * (end - start), - hipMemcpyDeviceToHost); + vector ipivHost(abs(end - start)); + size_t startCpy = (incx < 0) ? end : start; + hipMemcpy(ipivHost.data(), &ipiv[startCpy], + sizeof(rocblas_int) * abs(end - start), hipMemcpyDeviceToHost); rocblas_int blocksPivot = (n - 1) / LASWP_BLOCKSIZE + 1; dim3 gridPivot(blocksPivot, 1, 1); @@ -116,7 +118,7 @@ void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A, for (rocblas_int i = start; i != end; i += incx) { - if (ipivHost[i - start] == i + 1) + if (ipivHost[i - startCpy] == i + 1) continue; hipLaunchKernelGGL(laswp_external, gridPivot, threads, 0, stream, n, A,