From 76348d11857af7eeec6a374d8d2997162b34c0a9 Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Tue, 19 Jun 2018 09:26:27 -0500
Subject: [PATCH 1/4] Copy hookups for clang-tidy from miopen. Not working yet.

---
 CMakeLists.txt | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e78e161a..837fffa38 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,6 +117,63 @@ include( ROCMCreatePackage )
 include( ROCMInstallTargets )
 include( ROCMPackageConfigHelpers )
 include( ROCMInstallSymlinks )
+include( ROCMClangTidy )
+rocm_enable_clang_tidy(
+    CHECKS
+        *
+        -cert-env33-c
+        -android-cloexec-fopen
+        -cert-msc50-cpp
+        -clang-analyzer-alpha.core.CastToStruct
+        -clang-analyzer-optin.performance.Padding
+        -clang-diagnostic-deprecated-declarations
+        -clang-diagnostic-extern-c-compat
+        -clang-diagnostic-unused-command-line-argument
+        -cppcoreguidelines-pro-bounds-array-to-pointer-decay
+        -cppcoreguidelines-pro-bounds-constant-array-index
+        -cppcoreguidelines-pro-bounds-pointer-arithmetic
+        -cppcoreguidelines-pro-type-member-init
+        -cppcoreguidelines-pro-type-reinterpret-cast
+        -cppcoreguidelines-pro-type-union-access
+        -cppcoreguidelines-pro-type-vararg
+        -cppcoreguidelines-special-member-functions
+        -fuchsia-*
+        -google-readability-braces-around-statements
+        -google-readability-todo
+        -google-runtime-int
+        -google-runtime-references
+        -hicpp-braces-around-statements
+        -hicpp-explicit-conversions
+        -hicpp-no-array-decay
+        -hicpp-special-member-functions
+        -hicpp-use-override
+        # This check is broken
+        -hicpp-use-auto
+        -llvm-header-guard
+        -llvm-include-order
+        -misc-macro-parentheses
+        -modernize-use-auto
+        -modernize-use-override
+        -modernize-pass-by-value
+        -modernize-use-default-member-init
+        -modernize-use-transparent-functors
+        -performance-unnecessary-value-param
+        -readability-braces-around-statements
+        -readability-else-after-return
+        -readability-named-parameter
+        -*-explicit-constructor
+        -*-use-emplace
+        -*-use-equals-default
+    ERRORS
+        *
+        -readability-inconsistent-declaration-parameter-name
+    HEADER_FILTER
+        ".*hpp"
+    EXTRA_ARGS
+        -DROCSOLVER_USE_CLANG_TIDY
+    ANALYZE_TEMPORARY_DTORS ON
+
+)
 
 rocm_setup_version( VERSION 0.1.0 NO_GIT_TAG_VERSION )
 

From 5209abb54d80977ceb5cea3d80f924b65c0c6ed9 Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Tue, 19 Jun 2018 09:27:40 -0500
Subject: [PATCH 2/4] Mark rocsolver-bench as a clang-tidy target.

---
 README.md                              |   7 +-
 clients/gtest/getrs_gtest.cpp          | 162 +++++++++++++++++++
 clients/include/testing_getrs.hpp      | 216 +++++++++++++++++++++++++
 library/src/lapack/roclapack_getf2.cpp |  12 +-
 library/src/lapack/roclapack_getrf.cpp |  14 +-
 library/src/lapack/roclapack_getrs.cpp |  31 ++++
 library/src/lapack/roclapack_getrs.hpp |  90 +++++++++++
 library/src/lapack/roclapack_potf2.cpp |  12 +-
 8 files changed, 521 insertions(+), 23 deletions(-)
 create mode 100644 clients/gtest/getrs_gtest.cpp
 create mode 100644 clients/include/testing_getrs.hpp
 create mode 100644 library/src/lapack/roclapack_getrs.cpp
 create mode 100644 library/src/lapack/roclapack_getrs.hpp

diff --git a/README.md b/README.md
index 9ea51d172..c2eec0a7e 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ CXX=/opt/rocm/bin/hcc cmake ..
 make
 ```
 # Implemented functions in LAPACK notation
-Cholesky decomposition: `rocsolver_spotf2() rocsolver_dpotf2()`
-unblocked LU decomposition: `rocsolver_sgetf2() rocsolver_dgetf2()`
-blocked LU decomposition: `rocsolver_sgetrf() rocsolver_dgetrf()`
+Cholesky decomposition: `rocsolver_spotf2() rocsolver_dpotf2()`  
+unblocked LU decomposition: `rocsolver_sgetf2() rocsolver_dgetf2()`  
+blocked LU decomposition: `rocsolver_sgetrf() rocsolver_dgetrf()`  
+solution of system of linear equations: `rocsolver_sgetrs() rocsolver_dgetrs()`  
diff --git a/clients/gtest/getrs_gtest.cpp b/clients/gtest/getrs_gtest.cpp
new file mode 100644
index 000000000..8289ab102
--- /dev/null
+++ b/clients/gtest/getrs_gtest.cpp
@@ -0,0 +1,162 @@
+/* ************************************************************************
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * ************************************************************************ */
+
+#include "testing_getrf.hpp"
+#include "utility.h"
+#include <gtest/gtest.h>
+#include <math.h>
+#include <stdexcept>
+#include <vector>
+
+using ::testing::Combine;
+using ::testing::TestWithParam;
+using ::testing::Values;
+using ::testing::ValuesIn;
+using namespace std;
+
+// only GCC/VS 2010 comes with std::tr1::tuple, but it is unnecessary,
+// std::tuple is good enough;
+
+typedef std::tuple<vector<int>, int> getrf_tuple;
+
+/* =====================================================================
+README: This file contains testers to verify the correctness of
+        BLAS routines with google test
+
+        It is supposed to be played/used by advance / expert users
+        Normal users only need to get the library routines without testers
+     =================================================================== */
+
+/* =====================================================================
+Advance users only: BrainStorm the parameters but do not make artificial one
+which invalidates the matrix. like lda pairs with M, and "lda must >= M". case
+"lda < M" will be guarded by argument-checkers inside API of course. Yet, the
+goal of this file is to verify result correctness not argument-checkers.
+
+Representative sampling is sufficient, endless brute-force sampling is not
+necessary
+=================================================================== */
+
+// vector of vector, each vector is a {M, lda};
+// add/delete as a group
+const vector<vector<int>> matrix_size_range = {
+    {-1, 1}, {10, 10}, {10, 20}, {500, 500}, {500, 750},
+};
+
+// each is a N
+const vector<int> n_size_range = {
+    1, 20, 40, 600, 600,
+};
+
+const vector<vector<int>> large_matrix_size_range = {
+    {192, 192}, {640, 640}, {1000, 1000}, {1024, 1024}, {2000, 2000},
+};
+
+const vector<int> large_n_size_range = {
+    192, 640, 1000, 1024, 2000,
+};
+
+/* ===============Google Unit
+ * Test==================================================== */
+
+/* =====================================================================
+     LAPACK getrf:
+=================================================================== */
+
+/* ============================Setup
+ * Arguments======================================= */
+
+// Please use "class Arguments" (see utility.hpp) to pass parameters to
+// templated testers; Some routines may not touch/use certain "members" of
+// objects "argus". like BLAS-1 Scal does not have lda, BLAS-2 GEMV does not
+// have ldb, ldc; That is fine. These testers & routines will leave untouched
+// members alone. Do not use std::tuple to directly pass parameters to testers
+// by std:tuple, you have unpack it with extreme care for each one by like
+// "std::get<0>" which is not intuitive and error-prone
+
+Arguments setup_getrf_arguments(getrf_tuple tup) {
+
+  vector<int> matrix_size = std::get<0>(tup);
+  int n_size = std::get<1>(tup);
+
+  Arguments arg;
+
+  // see the comments about matrix_size_range above
+  arg.M = matrix_size[0];
+  arg.N = n_size;
+  arg.lda = matrix_size[1];
+
+  arg.timing = 0;
+
+  return arg;
+}
+
+class getrf_gtest : public ::TestWithParam<getrf_tuple> {
+protected:
+  getrf_gtest() {}
+  virtual ~getrf_gtest() {}
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+};
+
+TEST_P(getrf_gtest, getrf_gtest_float) {
+  // GetParam return a tuple. Tee setup routine unpack the tuple
+  // and initializes arg(Arguments) which will be passed to testing routine
+  // The Arguments data struture have physical meaning associated.
+  // while the tuple is non-intuitive.
+
+  Arguments arg = setup_getrf_arguments(GetParam());
+
+  rocblas_status status = testing_getrf<float>(arg);
+
+  // if not success, then the input argument is problematic, so detect the error
+  // message
+  if (status != rocblas_status_success) {
+
+    if (arg.M < 0 || arg.N < 0) {
+      EXPECT_EQ(rocblas_status_invalid_size, status);
+    } else if (arg.lda < arg.M) {
+      EXPECT_EQ(rocblas_status_invalid_size, status);
+    }
+  }
+}
+
+TEST_P(getrf_gtest, getrf_gtest_double) {
+  // GetParam return a tuple. Tee setup routine unpack the tuple
+  // and initializes arg(Arguments) which will be passed to testing routine
+  // The Arguments data struture have physical meaning associated.
+  // while the tuple is non-intuitive.
+
+  Arguments arg = setup_getrf_arguments(GetParam());
+
+  rocblas_status status = testing_getrf<double>(arg);
+
+  // if not success, then the input argument is problematic, so detect the error
+  // message
+  if (status != rocblas_status_success) {
+
+    if (arg.M < 0 || arg.N < 0) {
+      EXPECT_EQ(rocblas_status_invalid_size, status);
+    } else if (arg.lda < arg.M) {
+      EXPECT_EQ(rocblas_status_invalid_size, status);
+    }
+  }
+}
+
+// notice we are using vector of vector
+// so each element in xxx_range is a vector,
+// ValuesIn take each element (a vector) and combine them and feed them to
+// test_p The combinations are  { {M, N, lda}}
+
+// This function mainly test the scope of matrix_size.
+INSTANTIATE_TEST_CASE_P(daily_lapack, getrf_gtest,
+                        Combine(ValuesIn(large_matrix_size_range),
+                                ValuesIn(large_n_size_range)));
+
+// THis function mainly test the scope of uplo_range, the scope of
+// matrix_size_range is small
+INSTANTIATE_TEST_CASE_P(checkin_lapack, getrf_gtest,
+                        Combine(ValuesIn(matrix_size_range),
+                                ValuesIn(n_size_range)));
diff --git a/clients/include/testing_getrs.hpp b/clients/include/testing_getrs.hpp
new file mode 100644
index 000000000..0954930a7
--- /dev/null
+++ b/clients/include/testing_getrs.hpp
@@ -0,0 +1,216 @@
+/* ************************************************************************
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * ************************************************************************ */
+
+#include <cmath> // std::abs
+#include <fstream>
+#include <iostream>
+#include <limits> // std::numeric_limits<T>::epsilon();
+#include <stdlib.h>
+#include <string>
+#include <vector>
+
+#include "arg_check.h"
+#include "cblas_interface.h"
+#include "norm.h"
+#include "rocblas_test_unique_ptr.hpp"
+#include "rocsolver.hpp"
+#include "unit.h"
+#include "utility.h"
+#ifdef GOOGLE_TEST
+#include <gtest/gtest.h>
+#endif
+
+// this is max error PER element after the LU
+#define GETRF_ERROR_EPS_MULTIPLIER 500
+
+using namespace std;
+
+template <typename T> rocblas_status testing_getrf(Arguments argus) {
+
+  rocblas_int M = argus.M;
+  rocblas_int N = argus.N;
+  rocblas_int lda = argus.lda;
+
+  rocblas_int safe_size = 100; // arbitrarily set to 100
+
+  rocblas_int size_A = max(lda, M) * N;
+
+  rocblas_status status;
+
+  std::unique_ptr<rocblas_test::handle_struct> unique_ptr_handle(
+      new rocblas_test::handle_struct);
+  rocblas_handle handle = unique_ptr_handle->handle;
+
+  // check here to prevent undefined memory allocation error
+  if (M < 0 || N < 0 || lda < M) {
+    auto dA_managed =
+        rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * safe_size),
+                           rocblas_test::device_free};
+    T *dA = (T *)dA_managed.get();
+    if (!dA) {
+      PRINT_IF_HIP_ERROR(hipErrorOutOfMemory);
+      return rocblas_status_memory_error;
+    }
+
+    auto dIpiv_managed =
+        rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)),
+                           rocblas_test::device_free};
+    rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get();
+
+    status = rocsolver_getrf<T>(handle, M, N, dA, lda, dIpiv);
+
+    getrf_arg_check(status, M, N);
+
+    return status;
+  }
+
+  // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory
+  vector<T> hA(size_A);
+  vector<T> AAT(size_A);
+
+  double gpu_time_used, cpu_time_used;
+  T error_eps_multiplier = GETRF_ERROR_EPS_MULTIPLIER;
+  T eps = std::numeric_limits<T>::epsilon();
+
+  // allocate memory on device
+  auto dA_managed =
+      rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * size_A),
+                         rocblas_test::device_free};
+  T *dA = (T *)dA_managed.get();
+  if (!dA) {
+    PRINT_IF_HIP_ERROR(hipErrorOutOfMemory);
+    return rocblas_status_memory_error;
+  }
+
+  //  initialize full random matrix hA with all entries in [1, 10]
+  rocblas_init<T>(hA, M, N, lda);
+
+  //  pad untouched area into zero
+  for (int i = M; i < lda; i++) {
+    for (int j = 0; j < N; j++) {
+      hA[i + j * lda] = 0.0;
+    }
+  }
+
+  // put it into [0, 1]
+  for (int i = M; i < lda; i++) {
+    for (int j = 0; j < N; j++) {
+      hA[i + j * lda] = (hA[i + j * lda] - 1.0) / 10.0;
+    }
+  }
+
+  // now make it diagonally dominant
+  for (int i = 0; i < min(M, N); i++) {
+    hA[i + i * lda] *= 420.0;
+  }
+
+  // copy data from CPU to device
+  CHECK_HIP_ERROR(
+      hipMemcpy(dA, hA.data(), sizeof(T) * size_A, hipMemcpyHostToDevice));
+
+  // allocate space for the pivoting array
+  vector<int> hIpiv(min(M, N));
+  auto dIpiv_managed =
+      rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)),
+                         rocblas_test::device_free};
+  rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get();
+
+  T max_err_1 = 0.0;
+  if (argus.unit_check || argus.norm_check) {
+    // calculate dXorB <- A^(-1) B rocblas_pointer_mode_host
+    const rocblas_status retGPU =
+        rocsolver_getrf<T>(handle, M, N, dA, lda, dIpiv);
+
+    CHECK_HIP_ERROR(
+        hipMemcpy(AAT.data(), dA, sizeof(T) * size_A, hipMemcpyDeviceToHost));
+
+    const int retCBLAS = cblas_getrf<T>(M, N, hA.data(), lda, hIpiv.data());
+
+    if (retCBLAS != 0) {
+      // error encountered - we expect the same to happen from the GPU!
+      if (retGPU == rocblas_status_success) {
+        fprintf(stderr, "rocBLAS should fail also but doesn't!");
+        return rocblas_status_internal_error;
+      }
+    } else {
+      CHECK_ROCBLAS_ERROR(retGPU);
+
+      // Error Check
+
+      // check if the pivoting returned is identical
+      vector<int> hIpivGPU(min(M, N));
+      CHECK_HIP_ERROR(hipMemcpy(hIpivGPU.data(), dIpiv, sizeof(int) * min(M, N),
+                                hipMemcpyDeviceToHost));
+      for (int j = 0; j < min(M, N); j++) {
+        const int refPiv = hIpiv[j];
+        const int gpuPiv = hIpivGPU[j];
+        if (refPiv != gpuPiv) {
+          cerr << "reference pivot " << j << ": " << refPiv << " vs " << gpuPiv
+               << endl;
+          return rocblas_status_internal_error;
+        }
+      }
+
+      // AAT contains calculated decomposition, so error is hA - AAT
+      for (int i = 0; i < M; i++) {
+        for (int j = 0; j < N; j++) {
+          AAT[i + j * lda] = abs(AAT[i + j * lda] - hA[i + j * lda]);
+        }
+      }
+
+      for (int i = 0; i < M; i++) {
+        for (int j = 0; j < N; j++) {
+          max_err_1 =
+              max_err_1 > AAT[i + j * lda] ? max_err_1 : AAT[i + j * lda];
+        }
+      }
+      getrf_err_res_check<T>(max_err_1, M, N, error_eps_multiplier, eps);
+    }
+  }
+
+  if (argus.timing) {
+    // GPU rocBLAS
+    gpu_time_used = get_time_us(); // in microseconds
+
+    const rocblas_status retGPU =
+        rocsolver_getrf<T>(handle, M, N, dA, lda, dIpiv);
+
+    gpu_time_used = get_time_us() - gpu_time_used;
+
+    // CPU cblas
+    cpu_time_used = get_time_us();
+
+    const int retCBLAS = cblas_getrf<T>(M, N, hA.data(), lda, hIpiv.data());
+
+    if (retCBLAS != 0) {
+      // error encountered - we expect the same to happen from the GPU!
+      if (retGPU == rocblas_status_success) {
+        fprintf(stderr, "rocBLAS should fail also but doesn't!");
+      }
+    } else {
+      CHECK_ROCBLAS_ERROR(retGPU);
+    }
+
+    cpu_time_used = get_time_us() - cpu_time_used;
+
+    // only norm_check return an norm error, unit check won't return anything
+    cout << "M , N , lda , us [gpu] , us [cpu]";
+
+    if (argus.norm_check)
+      cout << ",norm_error_host_ptr";
+
+    cout << endl;
+
+    cout << M << " , " << N << " , " << lda << " , " << gpu_time_used << " , "
+         << cpu_time_used;
+
+    if (argus.norm_check)
+      cout << " , " << max_err_1;
+
+    cout << endl;
+  }
+  return rocblas_status_success;
+}
+
+#undef GETRF_ERROR_EPS_MULTIPLIER
diff --git a/library/src/lapack/roclapack_getf2.cpp b/library/src/lapack/roclapack_getf2.cpp
index 4c415b032..3011c9cfe 100644
--- a/library/src/lapack/roclapack_getf2.cpp
+++ b/library/src/lapack/roclapack_getf2.cpp
@@ -14,14 +14,14 @@
  * ===========================================================================
  */
 
-extern "C" rocblas_status rocsolver_sgetf2(rocblas_handle handle, rocblas_int m,
-                                           rocblas_int n, float *A,
-                                           rocblas_int lda, rocblas_int *ipiv) {
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_sgetf2(rocblas_handle handle, rocblas_int m, rocblas_int n, float *A,
+                 rocblas_int lda, rocblas_int *ipiv) {
   return rocsolver_getf2_template<float>(handle, m, n, A, lda, ipiv);
 }
 
-extern "C" rocblas_status rocsolver_dgetf2(rocblas_handle handle, rocblas_int m,
-                                           rocblas_int n, double *A,
-                                           rocblas_int lda, rocblas_int *ipiv) {
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_dgetf2(rocblas_handle handle, rocblas_int m, rocblas_int n, double *A,
+                 rocblas_int lda, rocblas_int *ipiv) {
   return rocsolver_getf2_template<double>(handle, m, n, A, lda, ipiv);
 }
\ No newline at end of file
diff --git a/library/src/lapack/roclapack_getrf.cpp b/library/src/lapack/roclapack_getrf.cpp
index 24c6f3f79..2c6e817a5 100644
--- a/library/src/lapack/roclapack_getrf.cpp
+++ b/library/src/lapack/roclapack_getrf.cpp
@@ -14,16 +14,14 @@
  * ===========================================================================
  */
 
-extern "C" rocblas_status rocsolver_sgetrf(rocsolver_handle handle,
-                                           rocsolver_int m, rocsolver_int n,
-                                           float *A, rocsolver_int lda,
-                                           rocsolver_int *ipiv) {
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_sgetrf(rocsolver_handle handle, rocsolver_int m, rocsolver_int n,
+                 float *A, rocsolver_int lda, rocsolver_int *ipiv) {
   return rocsolver_getrf_template<float>(handle, m, n, A, lda, ipiv);
 }
 
-extern "C" rocblas_status rocsolver_dgetrf(rocsolver_handle handle,
-                                           rocsolver_int m, rocsolver_int n,
-                                           double *A, rocsolver_int lda,
-                                           rocsolver_int *ipiv) {
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_dgetrf(rocsolver_handle handle, rocsolver_int m, rocsolver_int n,
+                 double *A, rocsolver_int lda, rocsolver_int *ipiv) {
   return rocsolver_getrf_template<double>(handle, m, n, A, lda, ipiv);
 }
\ No newline at end of file
diff --git a/library/src/lapack/roclapack_getrs.cpp b/library/src/lapack/roclapack_getrs.cpp
new file mode 100644
index 000000000..921716627
--- /dev/null
+++ b/library/src/lapack/roclapack_getrs.cpp
@@ -0,0 +1,31 @@
+/* ************************************************************************
+ * Derived from the BSD2-licensed
+ * LAPACK routine (version 3.1) --
+ *     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ *     November 2006
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * ************************************************************************ */
+
+#include "roclapack_getrs.hpp"
+
+/*
+ * ===========================================================================
+ *    C wrapper
+ * ===========================================================================
+ */
+
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_sgetrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n,
+                 rocblas_int nrhs, const float *A, rocblas_int lda,
+                 const rocblas_int *ipiv, float *B, rocblas_int ldb) {
+  return rocsolver_getrs_template<float>(handle, trans, n, nrhs, A, lda, ipiv,
+                                         B, ldb);
+}
+
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_dgetrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n,
+                 rocblas_int nrhs, const double *A, rocblas_int lda,
+                 const rocblas_int *ipiv, double *B, rocblas_int ldb) {
+  return rocsolver_getrs_template<double>(handle, trans, n, nrhs, A, lda, ipiv,
+                                          B, ldb);
+}
diff --git a/library/src/lapack/roclapack_getrs.hpp b/library/src/lapack/roclapack_getrs.hpp
new file mode 100644
index 000000000..ef1c6e070
--- /dev/null
+++ b/library/src/lapack/roclapack_getrs.hpp
@@ -0,0 +1,90 @@
+/* ************************************************************************
+ * Derived from the BSD2-licensed
+ * LAPACK routine (version 3.1) --
+ *     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
+ *     November 2006
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * ************************************************************************ */
+
+#ifndef ROCLAPACK_GETRS_HPP
+#define ROCLAPACK_GETRS_HPP
+
+#include "rocsolver-export.h"
+#include <hip/hip_runtime.h>
+#include <rocblas.hpp>
+
+#include "ideal_sizes.hpp"
+#include "roclapack_laswp.hpp"
+
+#define GETRS_INPONE 0
+
+template <typename T>
+rocblas_status
+rocsolver_getrs_template(rocblas_handle handle, rocblas_operation trans,
+                         rocblas_int n, rocblas_int nrhs, const T *A,
+                         rocblas_int lda, const rocblas_int *ipiv, T *B,
+                         rocblas_int ldb) {
+
+  // TODO remove const_cast here once rocBLAS is released with the correct API
+
+  // check for possible input problems
+  if (n < 0 || nrhs < 0 || lda < max(1, n) || ldb < max(1, n)) {
+    cout << "Invalid size " << n << " " << nrhs << " " << lda << " " << ldb
+         << endl;
+    return rocblas_status_invalid_size;
+  }
+
+  // quick return
+  if (n == 0 || nrhs == 0) {
+    return rocblas_status_success;
+  }
+
+  T inpsResHost[2];
+  inpsResHost[GETRS_INPONE] = static_cast<T>(1);
+
+  // allocate a tiny bit of memory on device to avoid going onto CPU and needing
+  // to synchronize.
+  T *inpsResGPU;
+  hipMalloc(&inpsResGPU, sizeof(T));
+  hipMemcpy(inpsResGPU, &inpsResHost[0], sizeof(T), hipMemcpyHostToDevice);
+
+  if (trans == rocblas_operation_none) {
+
+    // solve A * X = B
+    // first apply row interchanges to the right hand sides
+    roclapack_laswp_template<T>(handle, nrhs, B, ldb, 1, n, ipiv, 1);
+
+    // solve L*X - B, overwriting B with X
+    rocblas_trsm<T>(handle, rocblas_side_left, rocblas_fill_lower,
+                    rocblas_operation_none, rocblas_diagonal_unit, n, nrhs,
+                    &inpsResGPU[GETRS_INPONE], const_cast<T *>(A), lda, B, ldb);
+
+    // solve U*X = B, overwriting B with X
+    rocblas_trsm<T>(handle, rocblas_side_left, rocblas_fill_upper,
+                    rocblas_operation_none, rocblas_diagonal_non_unit, n, nrhs,
+                    &inpsResGPU[GETRS_INPONE], const_cast<T *>(A), lda, B, ldb);
+  } else {
+
+    // solve A**T * X = B  or A**H * X = B
+    // solve U**T *X = B or U**H *X = B, overwriting B with X
+    rocblas_trsm<T>(handle, rocblas_side_left, rocblas_fill_upper, trans,
+                    rocblas_diagonal_non_unit, n, nrhs,
+                    &inpsResGPU[GETRS_INPONE], const_cast<T *>(A), lda, B, ldb);
+
+    // solve L**T *X = B, or L**H *X = B overwriting B with X
+    rocblas_trsm<T>(handle, rocblas_side_left, rocblas_fill_lower, trans,
+                    rocblas_diagonal_unit, n, nrhs, &inpsResGPU[GETRS_INPONE],
+                    const_cast<T *>(A), lda, B, ldb);
+
+    // apply row interchanges to the solution vectors
+    roclapack_laswp_template<T>(handle, nrhs, B, ldb, 1, n, ipiv, -1);
+  }
+
+  hipFree(inpsResGPU);
+
+  return rocblas_status_success;
+}
+
+#undef GETRS_INPONE
+
+#endif /* ROCLAPACK_GETRS_HPP */
diff --git a/library/src/lapack/roclapack_potf2.cpp b/library/src/lapack/roclapack_potf2.cpp
index 1c1d0ce15..1aa308e3c 100644
--- a/library/src/lapack/roclapack_potf2.cpp
+++ b/library/src/lapack/roclapack_potf2.cpp
@@ -14,14 +14,14 @@
  * ===========================================================================
  */
 
-extern "C" rocblas_status rocsolver_spotf2(rocblas_handle handle,
-                                           rocblas_fill uplo, rocblas_int n,
-                                           float *A, rocblas_int lda) {
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_spotf2(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
+                 float *A, rocblas_int lda) {
   return rocsolver_potf2_template<float>(handle, uplo, n, A, lda);
 }
 
-extern "C" rocblas_status rocsolver_dpotf2(rocblas_handle handle,
-                                           rocblas_fill uplo, rocblas_int n,
-                                           double *A, rocblas_int lda) {
+extern "C" ROCSOLVER_EXPORT rocblas_status
+rocsolver_dpotf2(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
+                 double *A, rocblas_int lda) {
   return rocsolver_potf2_template<double>(handle, uplo, n, A, lda);
 }

From cffb24a13cd1f87b41593fceca8973fd599ce6fc Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Tue, 19 Jun 2018 09:28:22 -0500
Subject: [PATCH 3/4] Mark rocsolver-bench as a clang-tidy target.

---
 clients/benchmarks/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt
index 1836c2fe4..417075862 100755
--- a/clients/benchmarks/CMakeLists.txt
+++ b/clients/benchmarks/CMakeLists.txt
@@ -39,6 +39,8 @@ set( rocsolver_benchmark_common
 add_executable( rocsolver-bench client.cpp ${rocsolver_benchmark_common} )
 target_compile_features( rocsolver-bench PRIVATE cxx_static_assert cxx_nullptr cxx_auto_type )
 
+rocm_clang_tidy_check(rocsolver-bench)
+
 if( BUILD_WITH_TENSILE )
     target_compile_definitions( rocsolver-bench PRIVATE BUILD_WITH_TENSILE=1 )
 else()

From 6b45238d86eed093f258e68d59a3039a295f0ae8 Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Tue, 19 Jun 2018 09:31:58 -0500
Subject: [PATCH 4/4] Add a solver for systems of linear equations to
 rocSOLVER.

This satisfies the sgetrs/dgetrs LAPACK API. Some trivial
performance tests show a speedup of >300 compared to the
reference LAPACK implementation on a Vega10 for large matrix
sizes in single precision.
---
 clients/benchmarks/client.cpp          |   8 +-
 clients/common/arg_check.cpp           |  21 ++++
 clients/common/cblas_interface.cpp     |  50 +++++++++
 clients/common/unit.cpp                |  16 +++
 clients/gtest/CMakeLists.txt           |   3 +
 clients/gtest/getrs_gtest.cpp          |  74 +++++++------
 clients/include/arg_check.h            |   3 +
 clients/include/cblas_interface.h      |   5 +
 clients/include/rocsolver.hpp          |  22 ++++
 clients/include/testing_getrs.hpp      | 139 ++++++++++++++++---------
 clients/include/unit.h                 |   4 +
 library/include/rocsolver-functions.h  | 105 +++++++++++++++++++
 library/src/CMakeLists.txt             |   6 +-
 library/src/lapack/roclapack_laswp.hpp |  24 +++--
 14 files changed, 381 insertions(+), 99 deletions(-)

diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp
index 01f18da47..3f78e677e 100644
--- a/clients/benchmarks/client.cpp
+++ b/clients/benchmarks/client.cpp
@@ -8,6 +8,7 @@
 
 #include "testing_getf2.hpp"
 #include "testing_getrf.hpp"
+#include "testing_getrs.hpp"
 #include "testing_potf2.hpp"
 #include "utility.h"
 
@@ -99,7 +100,7 @@ int main(int argc, char *argv[]) {
               
         ("function,f",
          po::value<std::string>(&function)->default_value("potf2"),
-         "LAPACK function to test. Options: potf2, getf2, getrf")
+         "LAPACK function to test. Options: potf2, getf2, getrf, getrs")
         
         ("precision,r", 
          po::value<char>(&precision)->default_value('s'), "Options: h,s,d,c,z")
@@ -191,6 +192,11 @@ int main(int argc, char *argv[]) {
       testing_getrf<float>(argus);
     else if (precision == 'd')
       testing_getrf<double>(argus);
+  } else if (function == "getrs") {
+    if (precision == 's')
+      testing_getrs<float>(argus);
+    else if (precision == 'd')
+      testing_getrs<double>(argus);
   } else {
     printf("Invalid value for --function \n");
     return -1;
diff --git a/clients/common/arg_check.cpp b/clients/common/arg_check.cpp
index b489e5f1a..346c87833 100644
--- a/clients/common/arg_check.cpp
+++ b/clients/common/arg_check.cpp
@@ -79,6 +79,27 @@ void getrf_arg_check(rocblas_status status, rocblas_int M, rocblas_int N) {
 #endif
 }
 
+void getrs_arg_check(rocblas_status status, rocblas_int M, rocblas_int nhrs,
+                     rocblas_int lda, rocblas_int ldb) {
+#ifdef GOOGLE_TEST
+  if (M < 0 || nhrs < 0 || lda < std::max(1, M) || ldb < std::max(1, M)) {
+    ASSERT_EQ(status, rocblas_status_invalid_size);
+  } else {
+    ASSERT_EQ(status, rocblas_status_success);
+  }
+#else
+  if (M < 0 || nhrs < 0 || lda < std::max(1, M) || ldb < std::max(1, M)) {
+    if (status != rocblas_status_invalid_size)
+      std::cerr << "result should be invalid size for size " << M << " and "
+                << nhrs << std::endl;
+  } else {
+    if (status != rocblas_status_success)
+      std::cerr << "result should be success for size " << M << " and " << nhrs
+                << std::endl;
+  }
+#endif
+}
+
 void verify_rocblas_status_invalid_pointer(rocblas_status status,
                                            const char *message) {
 #ifdef GOOGLE_TEST
diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp
index b7670fad1..1b32e116a 100644
--- a/clients/common/cblas_interface.cpp
+++ b/clients/common/cblas_interface.cpp
@@ -51,6 +51,16 @@ void cgetf2_(int *m, int *n, rocblas_float_complex *A, int *lda, int *ipiv,
 void zgetf2_(int *m, int *n, rocblas_double_complex *A, int *lda, int *ipiv,
              int *info);
 
+void sgetrs_(char *trans, int *n, int *nrhs, float *A, int *lda, int *ipiv,
+             float *B, int *ldb, int *info);
+void dgetrs_(char *trans, int *n, int *nrhs, double *A, int *lda, int *ipiv,
+             double *B, int *ldb, int *info);
+void cgetrs_(char *trans, int *n, int *nrhs, rocblas_float_complex *A, int *lda,
+             int *ipiv, rocblas_float_complex *B, int *ldb, int *info);
+void zgetrs_(char *trans, int *n, int *nrhs, rocblas_double_complex *A,
+             int *lda, int *ipiv, rocblas_double_complex *B, int *ldb,
+             int *info);
+
 #ifdef __cplusplus
 }
 #endif
@@ -743,3 +753,43 @@ rocblas_int cblas_potrf<rocblas_double_complex>(char uplo, rocblas_int m,
   zpotrf_(&uplo, &m, A, &lda, &info);
   return info;
 }
+
+// getrs
+template <>
+rocblas_int cblas_getrs<float>(char trans, rocblas_int n, rocblas_int nrhs,
+                               float *A, rocblas_int lda, rocblas_int *ipiv,
+                               float *B, rocblas_int ldb) {
+  rocblas_int info;
+  sgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+  return info;
+}
+
+template <>
+rocblas_int cblas_getrs<double>(char trans, rocblas_int n, rocblas_int nrhs,
+                                double *A, rocblas_int lda, rocblas_int *ipiv,
+                                double *B, rocblas_int ldb) {
+  rocblas_int info;
+  dgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+  return info;
+}
+
+template <>
+rocblas_int
+cblas_getrs<rocblas_float_complex>(char trans, rocblas_int n, rocblas_int nrhs,
+                                   rocblas_float_complex *A, rocblas_int lda,
+                                   rocblas_int *ipiv, rocblas_float_complex *B,
+                                   rocblas_int ldb) {
+  rocblas_int info;
+  cgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+  return info;
+}
+
+template <>
+rocblas_int cblas_getrs<rocblas_double_complex>(
+    char trans, rocblas_int n, rocblas_int nrhs, rocblas_double_complex *A,
+    rocblas_int lda, rocblas_int *ipiv, rocblas_double_complex *B,
+    rocblas_int ldb) {
+  rocblas_int info;
+  zgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
+  return info;
+}
\ No newline at end of file
diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp
index fec53fcc7..29cf0b1f8 100644
--- a/clients/common/unit.cpp
+++ b/clients/common/unit.cpp
@@ -182,4 +182,20 @@ void getrf_err_res_check(double max_error, rocblas_int M, rocblas_int N,
 #ifdef GOOGLE_TEST
   ASSERT_LE(max_error, forward_tolerance * eps);
 #endif
+}
+
+template <>
+void getrs_err_res_check(float max_error, rocblas_int M, rocblas_int nhrs,
+                         float forward_tolerance, float eps) {
+#ifdef GOOGLE_TEST
+  ASSERT_LE(max_error, forward_tolerance * eps);
+#endif
+}
+
+template <>
+void getrs_err_res_check(double max_error, rocblas_int M, rocblas_int nhrs,
+                         double forward_tolerance, double eps) {
+#ifdef GOOGLE_TEST
+  ASSERT_LE(max_error, forward_tolerance * eps);
+#endif
 }
\ No newline at end of file
diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt
index 105c598a0..def7c606c 100755
--- a/clients/gtest/CMakeLists.txt
+++ b/clients/gtest/CMakeLists.txt
@@ -35,6 +35,7 @@ find_package( Threads REQUIRED )
 set(roclapack_test_source
     getf2_gtest.cpp
     getrf_gtest.cpp
+    getrs_gtest.cpp
     potf2_gtest.cpp
     )
 
@@ -54,6 +55,8 @@ set( rocsolver_benchmark_common
 add_executable( rocsolver-test ${roclapack_test_source} ${rocsolver_test_source} ${rocsolver_benchmark_common} )
 target_compile_features( rocsolver-test PRIVATE cxx_static_assert cxx_nullptr cxx_auto_type )
 
+rocm_clang_tidy_check(rocsolver-test)
+
 target_compile_definitions( rocsolver-test PRIVATE BUILD_WITH_TENSILE=0 GOOGLE_TEST )
 
 # Internal header includes
diff --git a/clients/gtest/getrs_gtest.cpp b/clients/gtest/getrs_gtest.cpp
index 8289ab102..80e2ad899 100644
--- a/clients/gtest/getrs_gtest.cpp
+++ b/clients/gtest/getrs_gtest.cpp
@@ -3,7 +3,7 @@
  *
  * ************************************************************************ */
 
-#include "testing_getrf.hpp"
+#include "testing_getrs.hpp"
 #include "utility.h"
 #include <gtest/gtest.h>
 #include <math.h>
@@ -19,7 +19,7 @@ using namespace std;
 // only GCC/VS 2010 comes with std::tr1::tuple, but it is unnecessary,
 // std::tuple is good enough;
 
-typedef std::tuple<vector<int>, int> getrf_tuple;
+typedef std::tuple<vector<int>, vector<int>, char> getrs_tuple;
 
 /* =====================================================================
 README: This file contains testers to verify the correctness of
@@ -41,21 +41,27 @@ necessary
 
 // vector of vector, each vector is a {M, lda};
 // add/delete as a group
-const vector<vector<int>> matrix_size_range = {
+const vector<vector<int>> matrix_sizeA_range = {
     {-1, 1}, {10, 10}, {10, 20}, {500, 500}, {500, 750},
 };
 
-// each is a N
-const vector<int> n_size_range = {
-    1, 20, 40, 600, 600,
+// vector of vector, each vector is a {M, lda};
+// add/delete as a group
+const vector<vector<int>> matrix_sizeB_range = {
+    {-1, 1}, {10, 10}, {10, 20}, {500, 500}, {500, 750},
+};
+
+const vector<vector<int>> large_matrix_sizeA_range = {
+    {192, 192}, {640, 640}, {1000, 1000}, {1024, 1024}, {2000, 2000},
 };
 
-const vector<vector<int>> large_matrix_size_range = {
+const vector<vector<int>> large_matrix_sizeB_range = {
     {192, 192}, {640, 640}, {1000, 1000}, {1024, 1024}, {2000, 2000},
 };
 
-const vector<int> large_n_size_range = {
-    192, 640, 1000, 1024, 2000,
+const vector<char> transpose = {
+    'N',
+    'T',
 };
 
 /* ===============Google Unit
@@ -76,40 +82,42 @@ const vector<int> large_n_size_range = {
 // by std:tuple, you have unpack it with extreme care for each one by like
 // "std::get<0>" which is not intuitive and error-prone
 
-Arguments setup_getrf_arguments(getrf_tuple tup) {
+Arguments setup_getrs_arguments(getrs_tuple tup) {
 
-  vector<int> matrix_size = std::get<0>(tup);
-  int n_size = std::get<1>(tup);
+  vector<int> matrix_sizeA = std::get<0>(tup);
+  vector<int> matrix_sizeB = std::get<1>(tup);
 
   Arguments arg;
 
   // see the comments about matrix_size_range above
-  arg.M = matrix_size[0];
-  arg.N = n_size;
-  arg.lda = matrix_size[1];
+  arg.M = matrix_sizeA[0];
+  arg.N = matrix_sizeB[0];
+  arg.lda = matrix_sizeA[1];
+  arg.ldb = matrix_sizeB[1];
+  arg.transA_option = std::get<2>(tup);
 
   arg.timing = 0;
 
   return arg;
 }
 
-class getrf_gtest : public ::TestWithParam<getrf_tuple> {
+class getrs_gtest : public ::TestWithParam<getrs_tuple> {
 protected:
-  getrf_gtest() {}
-  virtual ~getrf_gtest() {}
+  getrs_gtest() {}
+  virtual ~getrs_gtest() {}
   virtual void SetUp() {}
   virtual void TearDown() {}
 };
 
-TEST_P(getrf_gtest, getrf_gtest_float) {
+TEST_P(getrs_gtest, getrs_gtest_float) {
   // GetParam return a tuple. Tee setup routine unpack the tuple
   // and initializes arg(Arguments) which will be passed to testing routine
   // The Arguments data struture have physical meaning associated.
   // while the tuple is non-intuitive.
 
-  Arguments arg = setup_getrf_arguments(GetParam());
+  Arguments arg = setup_getrs_arguments(GetParam());
 
-  rocblas_status status = testing_getrf<float>(arg);
+  rocblas_status status = testing_getrs<float>(arg);
 
   // if not success, then the input argument is problematic, so detect the error
   // message
@@ -117,21 +125,21 @@ TEST_P(getrf_gtest, getrf_gtest_float) {
 
     if (arg.M < 0 || arg.N < 0) {
       EXPECT_EQ(rocblas_status_invalid_size, status);
-    } else if (arg.lda < arg.M) {
+    } else if (arg.lda < arg.M || arg.ldb < arg.M) {
       EXPECT_EQ(rocblas_status_invalid_size, status);
     }
   }
 }
 
-TEST_P(getrf_gtest, getrf_gtest_double) {
+TEST_P(getrs_gtest, getrs_gtest_double) {
   // GetParam return a tuple. Tee setup routine unpack the tuple
   // and initializes arg(Arguments) which will be passed to testing routine
   // The Arguments data struture have physical meaning associated.
   // while the tuple is non-intuitive.
 
-  Arguments arg = setup_getrf_arguments(GetParam());
+  Arguments arg = setup_getrs_arguments(GetParam());
 
-  rocblas_status status = testing_getrf<double>(arg);
+  rocblas_status status = testing_getrs<double>(arg);
 
   // if not success, then the input argument is problematic, so detect the error
   // message
@@ -139,7 +147,7 @@ TEST_P(getrf_gtest, getrf_gtest_double) {
 
     if (arg.M < 0 || arg.N < 0) {
       EXPECT_EQ(rocblas_status_invalid_size, status);
-    } else if (arg.lda < arg.M) {
+    } else if (arg.lda < arg.M || arg.ldb < arg.M) {
       EXPECT_EQ(rocblas_status_invalid_size, status);
     }
   }
@@ -151,12 +159,14 @@ TEST_P(getrf_gtest, getrf_gtest_double) {
 // test_p The combinations are  { {M, N, lda}}
 
 // This function mainly test the scope of matrix_size.
-INSTANTIATE_TEST_CASE_P(daily_lapack, getrf_gtest,
-                        Combine(ValuesIn(large_matrix_size_range),
-                                ValuesIn(large_n_size_range)));
+INSTANTIATE_TEST_CASE_P(daily_lapack, getrs_gtest,
+                        Combine(ValuesIn(large_matrix_sizeA_range),
+                                ValuesIn(large_matrix_sizeB_range),
+                                ValuesIn(transpose)));
 
 // THis function mainly test the scope of uplo_range, the scope of
 // matrix_size_range is small
-INSTANTIATE_TEST_CASE_P(checkin_lapack, getrf_gtest,
-                        Combine(ValuesIn(matrix_size_range),
-                                ValuesIn(n_size_range)));
+INSTANTIATE_TEST_CASE_P(checkin_lapack, getrs_gtest,
+                        Combine(ValuesIn(matrix_sizeA_range),
+                                ValuesIn(matrix_sizeA_range),
+                                ValuesIn(transpose)));
diff --git a/clients/include/arg_check.h b/clients/include/arg_check.h
index c7af7233e..7b8911aff 100644
--- a/clients/include/arg_check.h
+++ b/clients/include/arg_check.h
@@ -35,6 +35,9 @@ void getf2_arg_check(rocsolver_status status, rocsolver_int M, rocsolver_int N);
 
 void getrf_arg_check(rocsolver_status status, rocsolver_int M, rocsolver_int N);
 
+void getrs_arg_check(rocsolver_status status, rocsolver_int M,
+                     rocsolver_int nhrs, rocblas_int lda, rocblas_int ldb);
+
 template <typename T> void verify_not_nan(T arg);
 
 template <typename T> void verify_equal(T arg1, T arg2, const char *message);
diff --git a/clients/include/cblas_interface.h b/clients/include/cblas_interface.h
index be8a9aaa6..1359a41ce 100644
--- a/clients/include/cblas_interface.h
+++ b/clients/include/cblas_interface.h
@@ -105,6 +105,11 @@ template <typename T>
 rocblas_int cblas_getrf(rocblas_int m, rocblas_int n, T *A, rocblas_int lda,
                         rocblas_int *ipiv);
 
+template <typename T>
+rocblas_int cblas_getrs(char trans, rocblas_int n, rocblas_int nrhs, T *A,
+                        rocblas_int lda, rocblas_int *ipiv, T *B,
+                        rocblas_int ldb);
+
 template <typename T>
 rocblas_int cblas_potrf(char uplo, rocblas_int m, T *A, rocblas_int lda);
 /* ============================================================================================
diff --git a/clients/include/rocsolver.hpp b/clients/include/rocsolver.hpp
index ecbcf0e66..34e22e910 100644
--- a/clients/include/rocsolver.hpp
+++ b/clients/include/rocsolver.hpp
@@ -59,4 +59,26 @@ inline rocblas_status rocsolver_getrf(rocblas_handle handle, rocblas_int m,
   return rocsolver_dgetrf(handle, m, n, A, lda, ipiv);
 }
 
+template <typename T>
+inline rocblas_status
+rocsolver_getrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n,
+                rocblas_int nrhs, const T *A, rocblas_int lda,
+                const rocblas_int *ipiv, T *B, rocblas_int ldb);
+
+template <>
+inline rocblas_status
+rocsolver_getrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n,
+                rocblas_int nrhs, const float *A, rocblas_int lda,
+                const rocblas_int *ipiv, float *B, rocblas_int ldb) {
+  return rocsolver_sgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
+}
+
+template <>
+inline rocblas_status
+rocsolver_getrs(rocblas_handle handle, rocblas_operation trans, rocblas_int n,
+                rocblas_int nrhs, const double *A, rocblas_int lda,
+                const rocblas_int *ipiv, double *B, rocblas_int ldb) {
+  return rocsolver_dgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
+}
+
 #endif /* ROCSOLVER_HPP */
diff --git a/clients/include/testing_getrs.hpp b/clients/include/testing_getrs.hpp
index 0954930a7..efa0d0d53 100644
--- a/clients/include/testing_getrs.hpp
+++ b/clients/include/testing_getrs.hpp
@@ -21,20 +21,32 @@
 #include <gtest/gtest.h>
 #endif
 
-// this is max error PER element after the LU
+// this is max error PER element after the solution
 #define GETRF_ERROR_EPS_MULTIPLIER 500
 
 using namespace std;
 
-template <typename T> rocblas_status testing_getrf(Arguments argus) {
+template <typename T> rocblas_status testing_getrs(Arguments argus) {
 
   rocblas_int M = argus.M;
-  rocblas_int N = argus.N;
+  rocblas_int nhrs = argus.N;
   rocblas_int lda = argus.lda;
+  rocblas_int ldb = argus.ldb;
+  char trans = argus.transA_option;
+
+  rocblas_operation transRoc;
+  if (trans == 'N') {
+    transRoc = rocblas_operation_none;
+  } else if (trans == 'T') {
+    transRoc = rocblas_operation_transpose;
+  } else {
+    throw runtime_error("Unsupported transpose operation.");
+  }
 
   rocblas_int safe_size = 100; // arbitrarily set to 100
 
-  rocblas_int size_A = max(lda, M) * N;
+  rocblas_int size_A = max(lda, M) * M;
+  rocblas_int size_B = max(ldb, M) * nhrs;
 
   rocblas_status status;
 
@@ -43,7 +55,7 @@ template <typename T> rocblas_status testing_getrf(Arguments argus) {
   rocblas_handle handle = unique_ptr_handle->handle;
 
   // check here to prevent undefined memory allocation error
-  if (M < 0 || N < 0 || lda < M) {
+  if (M < 0 || nhrs < 0 || lda < std::max(1, M) || ldb < std::max(1, M)) {
     auto dA_managed =
         rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * safe_size),
                            rocblas_test::device_free};
@@ -53,21 +65,32 @@ template <typename T> rocblas_status testing_getrf(Arguments argus) {
       return rocblas_status_memory_error;
     }
 
+    auto dB_managed =
+        rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * safe_size),
+                           rocblas_test::device_free};
+    T *dB = (T *)dB_managed.get();
+    if (!dB) {
+      PRINT_IF_HIP_ERROR(hipErrorOutOfMemory);
+      return rocblas_status_memory_error;
+    }
+
     auto dIpiv_managed =
-        rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)),
+        rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * M),
                            rocblas_test::device_free};
     rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get();
 
-    status = rocsolver_getrf<T>(handle, M, N, dA, lda, dIpiv);
+    status =
+        rocsolver_getrs<T>(handle, transRoc, M, nhrs, dA, lda, dIpiv, dB, ldb);
 
-    getrf_arg_check(status, M, N);
+    getrs_arg_check(status, M, nhrs, lda, ldb);
 
     return status;
   }
 
   // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory
   vector<T> hA(size_A);
-  vector<T> AAT(size_A);
+  vector<T> hB(size_B);
+  vector<T> hBRes(size_B);
 
   double gpu_time_used, cpu_time_used;
   T error_eps_multiplier = GETRF_ERROR_EPS_MULTIPLIER;
@@ -83,49 +106,74 @@ template <typename T> rocblas_status testing_getrf(Arguments argus) {
     return rocblas_status_memory_error;
   }
 
-  //  initialize full random matrix hA with all entries in [1, 10]
-  rocblas_init<T>(hA, M, N, lda);
+  auto dB_managed =
+      rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(T) * size_B),
+                         rocblas_test::device_free};
+  T *dB = (T *)dB_managed.get();
+  if (!dB) {
+    PRINT_IF_HIP_ERROR(hipErrorOutOfMemory);
+    return rocblas_status_memory_error;
+  }
+
+  //  initialize full random matrix hA, hB with all entries in [1, 10]
+  rocblas_init<T>(hA, M, M, lda);
+  rocblas_init<T>(hB, M, nhrs, ldb);
 
   //  pad untouched area into zero
   for (int i = M; i < lda; i++) {
-    for (int j = 0; j < N; j++) {
+    for (int j = 0; j < M; j++) {
       hA[i + j * lda] = 0.0;
     }
   }
+  for (int i = M; i < ldb; i++) {
+    for (int j = 0; j < nhrs; j++) {
+      hB[i + j * ldb] = 0.0;
+    }
+  }
 
   // put it into [0, 1]
   for (int i = M; i < lda; i++) {
-    for (int j = 0; j < N; j++) {
+    for (int j = 0; j < M; j++) {
       hA[i + j * lda] = (hA[i + j * lda] - 1.0) / 10.0;
     }
   }
 
   // now make it diagonally dominant
-  for (int i = 0; i < min(M, N); i++) {
+  for (int i = 0; i < M; i++) {
     hA[i + i * lda] *= 420.0;
   }
 
-  // copy data from CPU to device
-  CHECK_HIP_ERROR(
-      hipMemcpy(dA, hA.data(), sizeof(T) * size_A, hipMemcpyHostToDevice));
-
   // allocate space for the pivoting array
-  vector<int> hIpiv(min(M, N));
-  auto dIpiv_managed =
-      rocblas_unique_ptr{rocblas_test::device_malloc(sizeof(int) * min(M, N)),
-                         rocblas_test::device_free};
+  vector<int> hIpiv(M);
+  auto dIpiv_managed = rocblas_unique_ptr{
+      rocblas_test::device_malloc(sizeof(int) * M), rocblas_test::device_free};
   rocblas_int *dIpiv = (rocblas_int *)dIpiv_managed.get();
 
+  // do the LU decomposition of matrix A w/ the reference LAPACK routine
+  const int retCBLAS = cblas_getrf<T>(M, M, hA.data(), lda, hIpiv.data());
+  if (retCBLAS != 0) {
+    // error encountered - unlucky pick of random numbers? no use to continue
+    return rocblas_status_success;
+  }
+
+  // now copy pivoting indices and matrices to the GPU
+  CHECK_HIP_ERROR(
+      hipMemcpy(dA, hA.data(), sizeof(T) * size_A, hipMemcpyHostToDevice));
+  CHECK_HIP_ERROR(
+      hipMemcpy(dB, hB.data(), sizeof(T) * size_B, hipMemcpyHostToDevice));
+  CHECK_HIP_ERROR(
+      hipMemcpy(dIpiv, hIpiv.data(), sizeof(int) * M, hipMemcpyHostToDevice));
+
   T max_err_1 = 0.0;
   if (argus.unit_check || argus.norm_check) {
-    // calculate dXorB <- A^(-1) B rocblas_pointer_mode_host
     const rocblas_status retGPU =
-        rocsolver_getrf<T>(handle, M, N, dA, lda, dIpiv);
+        rocsolver_getrs<T>(handle, transRoc, M, nhrs, dA, lda, dIpiv, dB, ldb);
 
     CHECK_HIP_ERROR(
-        hipMemcpy(AAT.data(), dA, sizeof(T) * size_A, hipMemcpyDeviceToHost));
+        hipMemcpy(hBRes.data(), dB, sizeof(T) * size_B, hipMemcpyDeviceToHost));
 
-    const int retCBLAS = cblas_getrf<T>(M, N, hA.data(), lda, hIpiv.data());
+    const int retCBLAS = cblas_getrs<T>(trans, M, nhrs, hA.data(), lda,
+                                        hIpiv.data(), hB.data(), ldb);
 
     if (retCBLAS != 0) {
       // error encountered - we expect the same to happen from the GPU!
@@ -138,34 +186,20 @@ template <typename T> rocblas_status testing_getrf(Arguments argus) {
 
       // Error Check
 
-      // check if the pivoting returned is identical
-      vector<int> hIpivGPU(min(M, N));
-      CHECK_HIP_ERROR(hipMemcpy(hIpivGPU.data(), dIpiv, sizeof(int) * min(M, N),
-                                hipMemcpyDeviceToHost));
-      for (int j = 0; j < min(M, N); j++) {
-        const int refPiv = hIpiv[j];
-        const int gpuPiv = hIpivGPU[j];
-        if (refPiv != gpuPiv) {
-          cerr << "reference pivot " << j << ": " << refPiv << " vs " << gpuPiv
-               << endl;
-          return rocblas_status_internal_error;
-        }
-      }
-
-      // AAT contains calculated decomposition, so error is hA - AAT
+      // hBRes contains calculated decomposition, so error is hBres - hB
       for (int i = 0; i < M; i++) {
-        for (int j = 0; j < N; j++) {
-          AAT[i + j * lda] = abs(AAT[i + j * lda] - hA[i + j * lda]);
+        for (int j = 0; j < nhrs; j++) {
+          hBRes[i + j * ldb] = abs(hBRes[i + j * ldb] - hB[i + j * ldb]);
         }
       }
 
       for (int i = 0; i < M; i++) {
-        for (int j = 0; j < N; j++) {
+        for (int j = 0; j < nhrs; j++) {
           max_err_1 =
-              max_err_1 > AAT[i + j * lda] ? max_err_1 : AAT[i + j * lda];
+              max_err_1 > hBRes[i + j * ldb] ? max_err_1 : hBRes[i + j * ldb];
         }
       }
-      getrf_err_res_check<T>(max_err_1, M, N, error_eps_multiplier, eps);
+      getrs_err_res_check<T>(max_err_1, M, nhrs, error_eps_multiplier, eps);
     }
   }
 
@@ -174,14 +208,15 @@ template <typename T> rocblas_status testing_getrf(Arguments argus) {
     gpu_time_used = get_time_us(); // in microseconds
 
     const rocblas_status retGPU =
-        rocsolver_getrf<T>(handle, M, N, dA, lda, dIpiv);
+        rocsolver_getrs<T>(handle, transRoc, M, nhrs, dA, lda, dIpiv, dB, ldb);
 
     gpu_time_used = get_time_us() - gpu_time_used;
 
     // CPU cblas
     cpu_time_used = get_time_us();
 
-    const int retCBLAS = cblas_getrf<T>(M, N, hA.data(), lda, hIpiv.data());
+    const int retCBLAS = cblas_getrs<T>(trans, M, nhrs, hA.data(), lda,
+                                        hIpiv.data(), hB.data(), ldb);
 
     if (retCBLAS != 0) {
       // error encountered - we expect the same to happen from the GPU!
@@ -195,15 +230,15 @@ template <typename T> rocblas_status testing_getrf(Arguments argus) {
     cpu_time_used = get_time_us() - cpu_time_used;
 
     // only norm_check return an norm error, unit check won't return anything
-    cout << "M , N , lda , us [gpu] , us [cpu]";
+    cout << "M , nhrs , lda , ldb , us [gpu] , us [cpu]";
 
     if (argus.norm_check)
-      cout << ",norm_error_host_ptr";
+      cout << ", norm_error_host_ptr";
 
     cout << endl;
 
-    cout << M << " , " << N << " , " << lda << " , " << gpu_time_used << " , "
-         << cpu_time_used;
+    cout << M << " , " << nhrs << " , " << lda << " , " << ldb << " , "
+         << gpu_time_used << " , " << cpu_time_used;
 
     if (argus.norm_check)
       cout << " , " << max_err_1;
diff --git a/clients/include/unit.h b/clients/include/unit.h
index dc584d79f..fa429d6e1 100644
--- a/clients/include/unit.h
+++ b/clients/include/unit.h
@@ -45,6 +45,10 @@ template <typename T>
 void getf2_err_res_check(T max_error, rocblas_int M, rocblas_int N,
                          T forward_tolerance, T eps);
 
+template <typename T>
+void getrs_err_res_check(T max_error, rocblas_int M, rocblas_int nhrs,
+                         T forward_tolerance, T eps);
+
 template <typename T>
 void getrf_err_res_check(T max_error, rocblas_int M, rocblas_int N,
                          T forward_tolerance, T eps);
diff --git a/library/include/rocsolver-functions.h b/library/include/rocsolver-functions.h
index 185f21c42..3c904b044 100644
--- a/library/include/rocsolver-functions.h
+++ b/library/include/rocsolver-functions.h
@@ -263,6 +263,111 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle,
                                                    rocsolver_int lda,
                                                    rocsolver_int *ipiv);
 
+/*! \brief LAPACK API
+
+  \details
+  getrs solves a system of linear equations
+     A * X = B,  A**T * X = B,  or  A**H * X = B
+  with a general N-by-N matrix A using the LU factorization computed
+  by getrf.
+
+  @param[in]
+  trans
+           Specifies the form of the system of equations:
+           = 'N':  A * X = B     (No transpose)
+           = 'T':  A**T * X = B  (Transpose)
+           = 'C':  A**H * X = B  (Conjugate transpose)
+
+  @param[in]
+  n
+           The order of the matrix A.  N >= 0.
+
+  @param[in]
+  nrhs
+           The number of right hand sides, i.e., the number of columns
+           of the matrix B.  nrhs >= 0.
+
+  @param[in]
+  A
+           pointer storing matrix A on the GPU.
+
+  @param[in]
+  lda
+           The leading dimension of the array A.  lda >= max(1,n).
+
+  @param[in]
+  ipiv
+           The pivot indices from getrf; for 1<=i<=n, row i of the
+           matrix was interchanged with row ipiv(i). Assumes one-based indices!
+
+  @param[in,out]
+  B
+           pointer storing matrix B on the GPU., dimension (ldb,nrhs)
+           On entry, the right hand side matrix B.
+           On exit, the solution matrix X.
+
+  @param[in]
+  ldb
+           The leading dimension of the array B.  ldb >= max(1,n).
+
+   ********************************************************************/
+ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrs(
+    rocsolver_handle handle, rocsolver_operation trans, rocsolver_int n,
+    rocsolver_int nrhs, const float *A, rocsolver_int lda,
+    const rocsolver_int *ipiv, float *B, rocsolver_int ldb);
+
+/*! \brief LAPACK API
+
+  \details
+  getrs solves a system of linear equations
+     A * X = B,  A**T * X = B,  or  A**H * X = B
+  with a general N-by-N matrix A using the LU factorization computed
+  by getrf.
+
+  @param[in]
+  trans
+           Specifies the form of the system of equations:
+           = 'N':  A * X = B     (No transpose)
+           = 'T':  A**T * X = B  (Transpose)
+           = 'C':  A**H * X = B  (Conjugate transpose)
+
+  @param[in]
+  n
+           The order of the matrix A.  N >= 0.
+
+  @param[in]
+  nrhs
+           The number of right hand sides, i.e., the number of columns
+           of the matrix B.  nrhs >= 0.
+
+  @param[in]
+  A
+           pointer storing matrix A on the GPU.
+
+  @param[in]
+  lda
+           The leading dimension of the array A.  lda >= max(1,n).
+
+  @param[in]
+  ipiv
+           The pivot indices from getrf; for 1<=i<=n, row i of the
+           matrix was interchanged with row ipiv(i). Assumes one-based indices!
+
+  @param[in,out]
+  B
+           pointer storing matrix B on the GPU., dimension (ldb,nrhs)
+           On entry, the right hand side matrix B.
+           On exit, the solution matrix X.
+
+  @param[in]
+  ldb
+           The leading dimension of the array B.  ldb >= max(1,n).
+
+   ********************************************************************/
+ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs(
+    rocsolver_handle handle, rocsolver_operation trans, rocsolver_int n,
+    rocsolver_int nrhs, const double *A, rocsolver_int lda,
+    const rocsolver_int *ipiv, double *B, rocsolver_int ldb);
 #ifdef __cplusplus
 }
 #endif
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
index 4469734e3..b6c2c4a78 100755
--- a/library/src/CMakeLists.txt
+++ b/library/src/CMakeLists.txt
@@ -33,6 +33,7 @@ set( rocsolver_lapack_source
   lapack/rocblas.cpp
   lapack/roclapack_getf2.cpp
   lapack/roclapack_getrf.cpp
+  lapack/roclapack_getrs.cpp
   lapack/roclapack_potf2.cpp
 )
 
@@ -40,15 +41,14 @@ prepend_path( ".." rocsolver_headers_public relative_rocsolver_headers_public )
 
 add_library( rocsolver
   ${rocsolver_lapack_source}
-  ${rocsolver_blas3_source}
-  ${rocsolver_blas2_source}
-  ${rocsolver_blas1_source}
   ${relative_rocsolver_headers_public}
   ${rocsolver_auxiliary_source}
 )
 
 add_library( roc::rocsolver ALIAS rocsolver )
 
+rocm_clang_tidy_check(rocsolver)
+
 target_link_libraries( rocsolver PRIVATE hip::hip_hcc hip::hip_device hcc::hccshared )
 target_link_libraries( rocsolver PRIVATE /opt/rocm/rocblas/lib/librocblas.so ) #${ROCBLAS_LIBRARY})
 
diff --git a/library/src/lapack/roclapack_laswp.hpp b/library/src/lapack/roclapack_laswp.hpp
index bcc551c5f..e7b0fedd8 100644
--- a/library/src/lapack/roclapack_laswp.hpp
+++ b/library/src/lapack/roclapack_laswp.hpp
@@ -15,10 +15,11 @@
 using namespace std;
 
 template <typename T>
-__global__ void laswp_external(rocblas_int n, T *a, rocblas_int lda,
-                               rocblas_int exch1, rocblas_int *exch2) {
+__global__ void laswp_external(const rocblas_int n, T *a, const rocblas_int lda,
+                               const rocblas_int exch1,
+                               const rocblas_int *exch2) {
 
-  int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  const int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
   if (tid < n) {
     const T orig = a[exch1 + lda * tid];
     a[exch1 + lda * tid] = a[(*exch2 - 1) + lda * tid];
@@ -74,7 +75,7 @@ __global__ void laswp_external(rocblas_int n, T *a, rocblas_int lda,
 template <typename T>
 void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A,
                               rocblas_int lda, rocblas_int k1, rocblas_int k2,
-                              rocblas_int *ipiv, rocblas_int incx) {
+                              const rocblas_int *ipiv, rocblas_int incx) {
 
   if (n == 0) {
     // quick return
@@ -83,8 +84,8 @@ void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A,
 
   rocblas_int start, end;
   if (incx < 0) {
-    start = k2;
-    end = k1;
+    start = k2 - 1;
+    end = k1 - 1;
   } else {
     start = k1;
     end = k2;
@@ -99,13 +100,14 @@ void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A,
    * of ipiv to the host and check there. this causes the current limitation
    * that incx == 1
    */
-  if (incx != 1) {
+  if (incx != 1 && incx != -1) {
     throw runtime_error("roclapack_laswp increment must be one.");
   }
 
-  vector<int> ipivHost(end - start);
-  hipMemcpy(ipivHost.data(), &ipiv[start], sizeof(rocblas_int) * (end - start),
-            hipMemcpyDeviceToHost);
+  vector<int> ipivHost(abs(end - start));
+  size_t startCpy = (incx < 0) ? end : start;
+  hipMemcpy(ipivHost.data(), &ipiv[startCpy],
+            sizeof(rocblas_int) * abs(end - start), hipMemcpyDeviceToHost);
 
   rocblas_int blocksPivot = (n - 1) / LASWP_BLOCKSIZE + 1;
   dim3 gridPivot(blocksPivot, 1, 1);
@@ -116,7 +118,7 @@ void roclapack_laswp_template(rocblas_handle handle, rocblas_int n, T *A,
 
   for (rocblas_int i = start; i != end; i += incx) {
 
-    if (ipivHost[i - start] == i + 1)
+    if (ipivHost[i - startCpy] == i + 1)
       continue;
 
     hipLaunchKernelGGL(laswp_external<T>, gridPivot, threads, 0, stream, n, A,