Skip to content

Commit

Permalink
Second round of optimizations for LU (GETRF) (ROCm#202)
Browse files Browse the repository at this point in the history
* new block sizes
* fix no pivot cases
* changelog
* remove rocblas_initialize from bench client
  • Loading branch information
jzuniga-amd authored Jan 14, 2021
1 parent f5a215c commit dd3500e
Show file tree
Hide file tree
Showing 11 changed files with 302 additions and 209 deletions.
32 changes: 11 additions & 21 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,7 @@

Full documentation for rocSOLVER is available at [rocsolver.readthedocs.io](https://rocsolver.readthedocs.io/en/latest/).

## [(Unreleased) rocSOLVER for ROCm 4.1.0]
### Added
- Sample code and unit test for unified memory model/Heterogeneous Memory Management (HMM)

### Optimizations

### Changed

### Deprecated

### Removed

### Fixed
- Fixed runtime errors in debug mode caused by incorrect kernel launch bounds
- Fixed complex unit test bug caused by incorrect zaxpy function signature
- Eliminated a small memory transfer that was being done on the default stream
- Fixed GESVD right singular vectors for 1x1 matrices



## [(Unreleased) rocSOLVER 3.11.0 for ROCm 4.0.0]
## [(Unreleased) rocSOLVER 3.11.0 for ROCm 4.1.0]
### Added
- Eigensolver routines for symmetric/Hermitian matrices:
- STERF, STEQR
Expand All @@ -36,15 +16,25 @@ Full documentation for rocSOLVER is available at [rocsolver.readthedocs.io](http
- LATRD
- SYTD2, SYTRD (with batched and strided\_batched versions)
- HETD2, HETRD (with batched and strided\_batched versions)
- Sample code and unit test for unified memory model/Heterogeneous Memory Management (HMM)

### Optimizations
- Improved performance of LU factorization of small and mid-size matrices (n <= 2048)

### Changed
- Raised minimum requirement for building rocSOLVER from source to CMake 3.8
- Switched to use semantic versioning for the library
- Enabled automatic reallocation of memory workspace in rocsolver clients

### Removed
- Removed `-DOPTIMAL` from the `roc::rocsolver` CMake usage requirements. This is an internal
rocSOLVER definition and does not need to be defined by library users.

### Fixed
- Fixed runtime errors in debug mode caused by incorrect kernel launch bounds
- Fixed complex unit test bug caused by incorrect zaxpy function signature
- Eliminated a small memory transfer that was being done on the default stream
- Fixed GESVD right singular vectors for 1x1 matrices


## [rocSOLVER 3.10.0 for ROCm 3.10.0]
Expand Down
4 changes: 1 addition & 3 deletions rocsolver/clients/benchmarks/client.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* ************************************************************************
* Copyright (c) 2016-2020 Advanced Micro Devices, Inc.
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* ************************************************************************ */

#include "testing_bdsqr.hpp"
Expand Down Expand Up @@ -42,8 +42,6 @@ namespace po = boost::program_options;
int main(int argc, char* argv[])
try
{
rocblas_initialize();

Arguments argus;

// disable unit_check in client benchmark, it is only
Expand Down
18 changes: 14 additions & 4 deletions rocsolver/library/src/include/ideal_sizes.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* ************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc.
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc.
* ************************************************************************ */

#pragma once
Expand All @@ -23,13 +23,23 @@
#define ORMxx_ORMxx_BLOCKSIZE 32

// getf2/getrf
#define GETRF_GETF2_SWITCHSIZE 64
#define GETF2_MAX_THDS 256
#define GETRF_GETF2_BLOCKSIZE 64
#define GETF2_OPTIM_NGRP \
16, 15, 8, 8, 8, 8, 8, 8, 6, 6, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
#define GETF2_BATCH_OPTIM_MAX_SIZE 2048
#define GETF2_BATCH_OPTIM_MAX_SIZE 1024
#define GETF2_OPTIM_MAX_SIZE 1024
#define GETRF_NUM_INTERVALS_NORMAL 4
#define GETRF_INTERVALS_NORMAL 65, 657, 1217, 5249
#define GETRF_BLKSIZES_NORMAL 1, 32, 1, 128, 192
#define GETRF_NUM_INTERVALS_BATCH 3
#define GETRF_INTERVALS_BATCH 65, 497, 2049
#define GETRF_BLKSIZES_BATCH 1, 16, 32, 64
#define GETRF_NPVT_NUM_INTERVALS_NORMAL 3
#define GETRF_NPVT_INTERVALS_NORMAL 65, 3073, 4609
#define GETRF_NPVT_BLKSIZES_NORMAL 1, 32, 64, 192
#define GETRF_NPVT_NUM_INTERVALS_BATCH 3
#define GETRF_NPVT_INTERVALS_BATCH 45, 181, 2049
#define GETRF_NPVT_BLKSIZES_BATCH 1, 16, 32, 64

// getri
#define GETRI_SWITCHSIZE_MID 64
Expand Down
29 changes: 14 additions & 15 deletions rocsolver/library/src/lapack/roclapack_getf2.cpp
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
/* ************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc.
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc.
* ************************************************************************ */

#include "roclapack_getf2.hpp"

template <typename T, typename U>
template <bool PIVOT, typename T, typename U>
rocblas_status rocsolver_getf2_impl(rocblas_handle handle,
const rocblas_int m,
const rocblas_int n,
U A,
const rocblas_int lda,
rocblas_int* ipiv,
rocblas_int* info,
const int pivot)
rocblas_int* info)
{
using S = decltype(std::real(T{}));

Expand All @@ -22,7 +21,7 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle,
// TODO: logging is missing

// argument checking
rocblas_status st = rocsolver_getf2_getrf_argCheck(handle, m, n, lda, A, ipiv, info, pivot);
rocblas_status st = rocsolver_getf2_getrf_argCheck(handle, m, n, lda, A, ipiv, info, PIVOT);
if(st != rocblas_status_continue)
return st;

Expand Down Expand Up @@ -65,8 +64,8 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle,
init_scalars(handle, (T*)scalars);

// execution
return rocsolver_getf2_template<false, T, S>(
handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count, pivot,
return rocsolver_getf2_template<false, PIVOT, T, S>(
handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count,
(T*)scalars, (rocblas_index_value_t<S>*)work, (T*)pivotval, (rocblas_int*)pivotidx);
}

Expand All @@ -86,7 +85,7 @@ rocblas_status rocsolver_sgetf2(rocblas_handle handle,
rocblas_int* ipiv,
rocblas_int* info)
{
return rocsolver_getf2_impl<float>(handle, m, n, A, lda, ipiv, info, 1);
return rocsolver_getf2_impl<true, float>(handle, m, n, A, lda, ipiv, info);
}

rocblas_status rocsolver_dgetf2(rocblas_handle handle,
Expand All @@ -97,7 +96,7 @@ rocblas_status rocsolver_dgetf2(rocblas_handle handle,
rocblas_int* ipiv,
rocblas_int* info)
{
return rocsolver_getf2_impl<double>(handle, m, n, A, lda, ipiv, info, 1);
return rocsolver_getf2_impl<true, double>(handle, m, n, A, lda, ipiv, info);
}

rocblas_status rocsolver_cgetf2(rocblas_handle handle,
Expand All @@ -108,7 +107,7 @@ rocblas_status rocsolver_cgetf2(rocblas_handle handle,
rocblas_int* ipiv,
rocblas_int* info)
{
return rocsolver_getf2_impl<rocblas_float_complex>(handle, m, n, A, lda, ipiv, info, 1);
return rocsolver_getf2_impl<true, rocblas_float_complex>(handle, m, n, A, lda, ipiv, info);
}

rocblas_status rocsolver_zgetf2(rocblas_handle handle,
Expand All @@ -119,7 +118,7 @@ rocblas_status rocsolver_zgetf2(rocblas_handle handle,
rocblas_int* ipiv,
rocblas_int* info)
{
return rocsolver_getf2_impl<rocblas_double_complex>(handle, m, n, A, lda, ipiv, info, 1);
return rocsolver_getf2_impl<true, rocblas_double_complex>(handle, m, n, A, lda, ipiv, info);
}

rocblas_status rocsolver_sgetf2_npvt(rocblas_handle handle,
Expand All @@ -130,7 +129,7 @@ rocblas_status rocsolver_sgetf2_npvt(rocblas_handle handle,
rocblas_int* info)
{
rocblas_int* ipiv = nullptr;
return rocsolver_getf2_impl<float>(handle, m, n, A, lda, ipiv, info, 0);
return rocsolver_getf2_impl<false, float>(handle, m, n, A, lda, ipiv, info);
}

rocblas_status rocsolver_dgetf2_npvt(rocblas_handle handle,
Expand All @@ -141,7 +140,7 @@ rocblas_status rocsolver_dgetf2_npvt(rocblas_handle handle,
rocblas_int* info)
{
rocblas_int* ipiv = nullptr;
return rocsolver_getf2_impl<double>(handle, m, n, A, lda, ipiv, info, 0);
return rocsolver_getf2_impl<false, double>(handle, m, n, A, lda, ipiv, info);
}

rocblas_status rocsolver_cgetf2_npvt(rocblas_handle handle,
Expand All @@ -152,7 +151,7 @@ rocblas_status rocsolver_cgetf2_npvt(rocblas_handle handle,
rocblas_int* info)
{
rocblas_int* ipiv = nullptr;
return rocsolver_getf2_impl<rocblas_float_complex>(handle, m, n, A, lda, ipiv, info, 0);
return rocsolver_getf2_impl<false, rocblas_float_complex>(handle, m, n, A, lda, ipiv, info);
}

rocblas_status rocsolver_zgetf2_npvt(rocblas_handle handle,
Expand All @@ -163,7 +162,7 @@ rocblas_status rocsolver_zgetf2_npvt(rocblas_handle handle,
rocblas_int* info)
{
rocblas_int* ipiv = nullptr;
return rocsolver_getf2_impl<rocblas_double_complex>(handle, m, n, A, lda, ipiv, info, 0);
return rocsolver_getf2_impl<false, rocblas_double_complex>(handle, m, n, A, lda, ipiv, info);
}

} // extern C
Loading

0 comments on commit dd3500e

Please sign in to comment.