Skip to content

Commit

Permalink
Merge #1676 Fix the race condition of bicgstab
Browse files Browse the repository at this point in the history
This PR fixes the race condition of bicgstab which is found in #1563

Related PR: #1676
  • Loading branch information
yhmtsai authored Sep 10, 2024
2 parents 2a551be + f57bb12 commit 6db98d6
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
9 changes: 8 additions & 1 deletion common/unified/solver/bicgstab_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,18 @@ void finalize(std::shared_ptr<const DefaultExecutor> exec,
auto stop) {
if (stop[col].has_stopped() && !stop[col].is_finalized()) {
x(row, col) += alpha[col] * y(row, col);
stop[col].finalize();
}
},
x->get_size(), y->get_stride(), x, default_stride(y), row_vector(alpha),
*stop_status);
run_kernel(
exec,
[] GKO_KERNEL(auto col, auto stop) {
if (stop[col].has_stopped() && !stop[col].is_finalized()) {
stop[col].finalize();
}
},
x->get_size()[1], *stop_status);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_FINALIZE_KERNEL);
Expand Down
42 changes: 42 additions & 0 deletions test/solver/bicgstab_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,48 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef)
}


TEST_F(Bicgstab, BicgstabFinalizeIsEquivalentToRefWithoutRaceCondition)
{
/**
* This test is designed to detect the following problem. Originally, we
* assigned threads per value to update the value and the stop status if the
* stop status is stopped but not finished yet. However, it leads to race
* conditions. If all threads see stop status before the update, all values
* will be correctly updated. It is also possible that some threads already
* finalize the stop status, but the rest see the stop status as finalized
* such that they will not update the value. We make this test case large to
* trigger this race condition more easily. However, it is not guaranteed to
* fail with the old version because of race conditions.
*/
int m = 1e6;
int n = 2;
x = gen_mtx(m, n, n);
y = gen_mtx(m, n, n);
alpha = gen_mtx(1, n, n);
d_x = x->clone(exec);
d_y = y->clone(exec);
d_alpha = alpha->clone(exec);
stop_status = std::make_unique<gko::array<gko::stopping_status>>(ref, n);
for (size_t i = 0; i < n; ++i) {
stop_status->get_data()[i].reset();
}
// check correct handling for stopped columns
stop_status->get_data()[1].stop(1);
// finalize only update the stopped one but not finished yet
stop_status->get_data()[0].stop(1, false);
d_stop_status =
std::make_unique<gko::array<gko::stopping_status>>(exec, *stop_status);

gko::kernels::reference::bicgstab::finalize(ref, x.get(), y.get(),
alpha.get(), stop_status.get());
gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::finalize(
exec, d_x.get(), d_y.get(), d_alpha.get(), d_stop_status.get());

GKO_ASSERT_MTX_NEAR(d_x, x, ::r<value_type>::value);
GKO_ASSERT_ARRAY_EQ(*d_stop_status, *stop_status);
}


TEST_F(Bicgstab, BicgstabApplyOneRHSIsEquivalentToRef)
{
int m = 123;
Expand Down

0 comments on commit 6db98d6

Please sign in to comment.