From c13dac2e77cd687b5b813f3ff7f304da68abeb9e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 9 Sep 2024 16:42:57 +0200 Subject: [PATCH 1/2] add the test to detect the problem --- test/solver/bicgstab_kernels.cpp | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp index a90451a3f3a..9716acd86cb 100644 --- a/test/solver/bicgstab_kernels.cpp +++ b/test/solver/bicgstab_kernels.cpp @@ -245,6 +245,48 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef) } +TEST_F(Bicgstab, BicgstabFinalizeIsEquivalentToRefWithoutRaceCondition) +{ + /** + * This test is designed to detect the following problem. Originally, we + * assigned threads per value to update the value and the stop status if the + * stop status is stopped but not finished yet. However, it leads to race + * conditions. If all threads see stop status before the update, all values + * will be correctly updated. It is also possible that some threads already + * finalize the stop status, but the rest see the stop status as finalized + * such that they will not update the value. We make this test case large to + * trigger this race condition more easily. However, it is not guaranteed to + * fail with the old version because of race conditions. + */ + int m = 1e6; + int n = 2; + x = gen_mtx(m, n, n); + y = gen_mtx(m, n, n); + alpha = gen_mtx(1, n, n); + d_x = x->clone(exec); + d_y = y->clone(exec); + d_alpha = alpha->clone(exec); + stop_status = std::make_unique>(ref, n); + for (size_t i = 0; i < n; ++i) { + stop_status->get_data()[i].reset(); + } + // check correct handling for stopped columns + stop_status->get_data()[1].stop(1); + // finalize only update the stopped one but not finished yet + stop_status->get_data()[0].stop(1, false); + d_stop_status = + std::make_unique>(exec, *stop_status); + + gko::kernels::reference::bicgstab::finalize(ref, x.get(), y.get(), + alpha.get(), stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::finalize( + exec, d_x.get(), d_y.get(), d_alpha.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, ::r::value); + GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status); +} + + TEST_F(Bicgstab, BicgstabApplyOneRHSIsEquivalentToRef) { int m = 123; From f57bb12b6eee5ba6279b67fdd8695db35de9d941 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 5 Sep 2024 18:35:12 +0200 Subject: [PATCH 2/2] fix the race condition --- common/unified/solver/bicgstab_kernels.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp index b696815f0d4..c403da3bf96 100644 --- a/common/unified/solver/bicgstab_kernels.cpp +++ b/common/unified/solver/bicgstab_kernels.cpp @@ -174,11 +174,18 @@ void finalize(std::shared_ptr exec, auto stop) { if (stop[col].has_stopped() && !stop[col].is_finalized()) { x(row, col) += alpha[col] * y(row, col); - stop[col].finalize(); } }, x->get_size(), y->get_stride(), x, default_stride(y), row_vector(alpha), *stop_status); + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto stop) { + if (stop[col].has_stopped() && !stop[col].is_finalized()) { + stop[col].finalize(); + } + }, + x->get_size()[1], *stop_status); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);