From f7ee9ec910efe364168b6795ad20ec407074bf48 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 24 Sep 2024 17:42:24 +0200 Subject: [PATCH] add checked_lookup parameter --- common/cuda_hip/factorization/lu_kernels.cpp | 38 +++++++++++++------- core/factorization/lu.cpp | 8 ++--- core/factorization/lu_kernels.hpp | 2 +- dpcpp/factorization/lu_kernels.dp.cpp | 2 +- include/ginkgo/core/factorization/lu.hpp | 9 +++++ omp/factorization/lu_kernels.cpp | 13 +++++-- reference/factorization/lu_kernels.cpp | 13 +++++-- 7 files changed, 60 insertions(+), 25 deletions(-) diff --git a/common/cuda_hip/factorization/lu_kernels.cpp b/common/cuda_hip/factorization/lu_kernels.cpp index d960574838e..6cb9b02129b 100644 --- a/common/cuda_hip/factorization/lu_kernels.cpp +++ b/common/cuda_hip/factorization/lu_kernels.cpp @@ -85,7 +85,7 @@ __global__ __launch_bounds__(default_block_size) void initialize( } -template +template __global__ __launch_bounds__(default_block_size) void factorize( const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols, const IndexType* __restrict__ storage_offsets, @@ -130,12 +130,14 @@ __global__ __launch_bounds__(default_block_size) void factorize( upper_nz += config::warp_size) { const auto upper_col = cols[upper_nz]; const auto upper_val = vals[upper_nz]; - // const auto output_pos = lookup[upper_col]; - const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin; - if (output_pos >= row_begin && output_pos < row_end && - cols[output_pos] == upper_col) { - // if (output_pos != invalid_index()) { - // output_pos += row_begin; + if (checked_lookup) { + const auto pos = lookup[upper_col]; + if (pos != invalid_index()) { + vals[row_begin + pos] -= scale * upper_val; + } + } else { + const auto output_pos = + lookup.lookup_unsafe(upper_col) + row_begin; vals[output_pos] -= scale * upper_val; } } @@ -258,7 +260,7 @@ template void factorize(std::shared_ptr exec, const IndexType* lookup_offsets, const int64* lookup_descs, 
const int32* lookup_storage, const IndexType* diag_idxs, - matrix::Csr* factors, + matrix::Csr* factors, bool checked_lookup, array& tmp_storage) { const auto num_rows = factors->get_size()[0]; @@ -266,11 +268,21 @@ void factorize(std::shared_ptr exec, syncfree_storage storage(exec, tmp_storage, num_rows); const auto num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); - kernel::factorize<<get_stream()>>>( - factors->get_const_row_ptrs(), factors->get_const_col_idxs(), - lookup_offsets, lookup_storage, lookup_descs, diag_idxs, - as_device_type(factors->get_values()), storage, num_rows); + if (checked_lookup) { + kernel::factorize + <<get_stream()>>>( + factors->get_const_row_ptrs(), + factors->get_const_col_idxs(), lookup_offsets, + lookup_storage, lookup_descs, diag_idxs, + as_device_type(factors->get_values()), storage, num_rows); + } else { + kernel::factorize + <<get_stream()>>>( + factors->get_const_row_ptrs(), + factors->get_const_col_idxs(), lookup_offsets, + lookup_storage, lookup_descs, diag_idxs, + as_device_type(factors->get_values()), storage, num_rows); + } } } diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp index 8c155729810..36603c068f6 100644 --- a/core/factorization/lu.cpp +++ b/core/factorization/lu.cpp @@ -165,10 +165,10 @@ std::unique_ptr Lu::generate_impl( storage.get_const_data(), diag_idxs.get_data(), factors.get())); // run numerical factorization array tmp{exec}; - exec->run(make_factorize(storage_offsets.get_const_data(), - row_descs.get_const_data(), - storage.get_const_data(), - diag_idxs.get_const_data(), factors.get(), tmp)); + exec->run(make_factorize( + storage_offsets.get_const_data(), row_descs.get_const_data(), + storage.get_const_data(), diag_idxs.get_const_data(), factors.get(), + parameters_.checked_lookup, tmp)); return factorization_type::create_from_combined_lu(std::move(factors)); } diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp index 
f497398cb90..102f9eaaa52 100644 --- a/core/factorization/lu_kernels.hpp +++ b/core/factorization/lu_kernels.hpp @@ -33,7 +33,7 @@ namespace kernels { const IndexType* lookup_offsets, const int64* lookup_descs, \ const int32* lookup_storage, const IndexType* diag_idxs, \ matrix::Csr* factors, \ - array& tmp_storage) + bool checked_lookup array& tmp_storage) #define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType) \ diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp index a891b5b7b2f..d6a1c2ed5b2 100644 --- a/dpcpp/factorization/lu_kernels.dp.cpp +++ b/dpcpp/factorization/lu_kernels.dp.cpp @@ -39,7 +39,7 @@ template void factorize(std::shared_ptr exec, const IndexType* lookup_offsets, const int64* lookup_descs, const int32* lookup_storage, const IndexType* diag_idxs, - matrix::Csr* factors, + matrix::Csr* factors, bool checked_lookup, array& tmp_storage) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE); diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp index 5cb469b4d47..b0edaa07847 100644 --- a/include/ginkgo/core/factorization/lu.hpp +++ b/include/ginkgo/core/factorization/lu.hpp @@ -101,6 +101,15 @@ class Lu * incorrect results or crash. */ bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false); + + /** + * The symbolic factorization should contain the fill-in information. If + * this is not the case, users might face hangs or illegal memory accesses. + * Please enable this option when the symbolic factorization does not + * contain the full fill-in information. The symbolic factorization must + * still contain the entries of the original matrix. 
+ */ + bool GKO_FACTORY_PARAMETER_SCALAR(checked_lookup, false); }; /** diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp index 53847ff2b6c..c17d8cdfc9a 100644 --- a/omp/factorization/lu_kernels.cpp +++ b/omp/factorization/lu_kernels.cpp @@ -66,7 +66,7 @@ template void factorize(std::shared_ptr exec, const IndexType* lookup_offsets, const int64* lookup_descs, const int32* lookup_storage, const IndexType* diag_idxs, - matrix::Csr* factors, + matrix::Csr* factors, bool checked_lookup, array& tmp_storage) { const auto num_rows = factors->get_size()[0]; @@ -89,8 +89,15 @@ void factorize(std::shared_ptr exec, for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) { const auto col = cols[dep_nz]; const auto val = vals[dep_nz]; - const auto nz = row_begin + lookup.lookup_unsafe(col); - vals[nz] -= scale * val; + if (checked_lookup) { + const auto idx = lookup[col]; + if (idx != invalid_index()) { + vals[row_begin + idx] -= scale * val; + } + } else { + const auto nz = row_begin + lookup.lookup_unsafe(col); + vals[nz] -= scale * val; + } } } } diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp index d8516cffb49..2fc0a1b5d1b 100644 --- a/reference/factorization/lu_kernels.cpp +++ b/reference/factorization/lu_kernels.cpp @@ -65,7 +65,7 @@ template void factorize(std::shared_ptr exec, const IndexType* lookup_offsets, const int64* lookup_descs, const int32* lookup_storage, const IndexType* diag_idxs, - matrix::Csr* factors, + matrix::Csr* factors, bool checked_lookup, array& tmp_storage) { const auto num_rows = factors->get_size()[0]; @@ -87,8 +87,15 @@ void factorize(std::shared_ptr exec, for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) { const auto col = cols[dep_nz]; const auto val = vals[dep_nz]; - const auto nz = row_begin + lookup.lookup_unsafe(col); - vals[nz] -= scale * val; + if (checked_lookup) { + const auto idx = lookup[col]; + if (idx != invalid_index()) { + 
vals[row_begin + idx] -= scale * val; + } + } else { + const auto nz = row_begin + lookup.lookup_unsafe(col); + vals[nz] -= scale * val; + } } } }