Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

fix for DEFAULT TUNING_TARGET on AMD and NVIDIA GPUs #517

Merged
merged 11 commits into from
May 20, 2024
58 changes: 48 additions & 10 deletions src/interface/blas2/backend/default.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,21 @@ typename sb_handle_t::event_t _trsv(
sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda,
container_t1 _vx, increment_t _incx,
typename sb_handle_t::event_t _dependencies) {
return blas::internal::_trsv_impl<4, 2, uplo, trn, diag>(
sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies);
const auto device = sb_handle.get_queue().get_device();
if (device.is_gpu()) {
const std::string vendor =
device.template get_info<cl::sycl::info::device::vendor>();
if (vendor.find("Intel") == vendor.npos) {
return blas::internal::_trsv_impl<32, 4, uplo, trn, diag>(
hjabird marked this conversation as resolved.
Show resolved Hide resolved
sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies);
} else {
throw std::runtime_error(
"Trsv operator currently not supported on Intel GPUs");
}
} else {
return blas::internal::_trsv_impl<4, 2, uplo, trn, diag>(
sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies);
}
}
} // namespace backend
} // namespace trsv
Expand All @@ -152,8 +165,21 @@ typename sb_handle_t::event_t _tbsv(
sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA,
index_t _lda, container_t1 _vx, increment_t _incx,
const typename sb_handle_t::event_t& _dependencies) {
return blas::internal::_tbsv_impl<4, 2, uplo, trn, diag>(
sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies);
const auto device = sb_handle.get_queue().get_device();
if (device.is_gpu()) {
const std::string vendor =
device.template get_info<cl::sycl::info::device::vendor>();
if (vendor.find("Intel") == vendor.npos) {
return blas::internal::_tbsv_impl<32, 4, uplo, trn, diag>(
sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies);
} else {
throw std::runtime_error(
"Tbsv operator currently not supported on Intel GPUs");
}
} else {
return blas::internal::_tbsv_impl<4, 2, uplo, trn, diag>(
sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies);
}
}
} // namespace backend
} // namespace tbsv
Expand All @@ -163,12 +189,24 @@ namespace backend {
template <uplo_type uplo, transpose_type trn, diag_type diag,
typename sb_handle_t, typename index_t, typename container_t0,
typename container_t1, typename increment_t>
typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, index_t _N,
container_t0 _mA, container_t1 _vx,
increment_t _incx,
const typename sb_handle_t::event_t& _dependencies) {
return blas::internal::_tpsv_impl<4, 2, uplo, trn, diag>(sb_handle, _N, _mA,
_vx, _incx, _dependencies);
typename sb_handle_t::event_t _tpsv(
sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx,
increment_t _incx, const typename sb_handle_t::event_t& _dependencies) {
hjabird marked this conversation as resolved.
Show resolved Hide resolved
const auto device = sb_handle.get_queue().get_device();
if (device.is_gpu()) {
const std::string vendor =
device.template get_info<cl::sycl::info::device::vendor>();
if (vendor.find("Intel") == vendor.npos) {
return blas::internal::_tpsv_impl<32, 4, uplo, trn, diag>(
sb_handle, _N, _mA, _vx, _incx, _dependencies);
} else {
throw std::runtime_error(
"Tpsv operator currently not supported on Intel GPUs");
}
} else {
return blas::internal::_tpsv_impl<4, 2, uplo, trn, diag>(
sb_handle, _N, _mA, _vx, _incx, _dependencies);
}
}
} // namespace backend
} // namespace tpsv
Expand Down
18 changes: 5 additions & 13 deletions src/operations/blas1/IndexMaxMin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,22 +94,14 @@ PORTBLAS_INLINE void IndexMaxMin<is_max, is_step0, lhs_t, rhs_t>::eval(
using element_t =
typename ResolveReturnType<op, rhs_t>::type::value_t::value_t;

#ifndef __ADAPTIVECPP__
// reduction within the sub_group
for (index_t i = sg_local_range >> 1; i > 0; i >>= 1) {
if (sg_local_id < i) {
element_t shfl_val = sycl::shift_group_left(sg, val.get_value(), i);
index_t shfl_idx = sycl::shift_group_left(sg, val.get_index(), i);
value_t shfl{shfl_idx, shfl_val};
val = op::eval(val, shfl);
}
element_t shfl_val = cl::sycl::shift_group_left(sg, val.get_value(), i);
index_t shfl_idx = cl::sycl::shift_group_left(sg, val.get_index(), i);
value_t shfl{shfl_idx, shfl_val};
val = op::eval(val, shfl);
hjabird marked this conversation as resolved.
Show resolved Hide resolved
Rbiessy marked this conversation as resolved.
Show resolved Hide resolved
}
#else
// AdaptiveCpp uses a different interface "shift_group_left" which is
// recognized by the compiler but throws JIT errors at runtime. Currently this
// part is skipped as non-local memory kernel is never called with
// AdaptiveCpp.
#endif

const index_t lhs_idx =
ndItem.get_group_linear_id() * (local_range / sg_local_range) +
sg.get_group_linear_id();
Expand Down
17 changes: 1 addition & 16 deletions test/unittest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,25 +106,10 @@ endif()
if(is_dpcpp AND ${TUNING_TARGET} STREQUAL "DEFAULT")
if (${DPCPP_SYCL_TARGET} MATCHES "nvidia")
set(TESTS_TO_SKIP
${PORTBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tbsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tpsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_trsv_test.cpp
${PORTBLAS_UNITTEST}/blas3/blas3_trsm_test.cpp
)
message(WARNING "Targetting NVIDIA hardware with DEFAULT TUNING_TARGET.
Disabling tests for following operators: iamax, iamin, trsv, tbsv, tpsv, trsm.")
elseif (${DPCPP_SYCL_TARGET} MATCHES "amd")
set(TESTS_TO_SKIP
${PORTBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tbsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tpsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_trsv_test.cpp
)
message(WARNING "Targetting AMD hardware with DEFAULT TUNING_TARGET.
Disabling tests for following operators: iamax, iamin, tbsv, tpsv, trsv.")
Disabling tests for following operator: trsm.")
endif()
endif()

Expand Down
2 changes: 1 addition & 1 deletion test/unittest/blas1/blas1_iaminmax_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ const auto combi = ::testing::Combine(
::testing::Values("usm", "buf"), // allocation type
::testing::Values(api_type::async, api_type::sync), // Api
::testing::Values(11, 65, 1000000), // size
::testing::Values(-1, 5), // incX
::testing::Values(1, -1, 5), // incX
::testing::Values(generation_mode_t::Random, generation_mode_t::Limit,
generation_mode_t::Incrementing,
generation_mode_t::Decrementing),
Expand Down
Loading