Add parallel gmres implementation #662

Draft: wants to merge 25 commits into base: develop
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -480,6 +480,7 @@ if (ASGARD_BUILD_TESTS)
set (mpi_test_components
adapt
distribution
solver
time_advance
asgard_discretization
)
24 changes: 24 additions & 0 deletions src/asgard_matrix.hpp
@@ -1,6 +1,7 @@
#pragma once
#include "build_info.hpp"

#include "asgard_mpi.h"
#include "asgard_resources.hpp"
#include "lib_dispatch.hpp"
#include "tools.hpp"
@@ -11,6 +12,8 @@
#include <string>
#include <vector>

#include <unistd.h>

namespace asgard
{
// resource arguments allow developers to select host (CPU only) or device
@@ -1203,6 +1206,24 @@ template<typename P, mem_type mem, resource resrc>
template<resource, typename>
void fk::matrix<P, mem, resrc>::print(std::string label) const
{
#ifdef ASGARD_USE_MPI
static int const rank = []() {
int my_rank;
auto const status =
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
expect(status == 0);
return my_rank;
}();
static int const num_ranks = []() {
int output;
auto const status = MPI_Comm_size(MPI_COMM_WORLD, &output);
expect(status == 0);
return output;
}();
sleep(rank);
std::cout << "rank: " << rank << '\n';
#endif

if constexpr (mem == mem_type::owner)
std::cout << label << "(owner)" << '\n';

@@ -1233,6 +1254,9 @@ void fk::matrix<P, mem, resrc>::print(std::string label) const
}
std::cout << '\n';
}
#ifdef ASGARD_USE_MPI
sleep(num_ranks - rank - 1);
#endif
}

//
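For reference, a standalone sketch (not part of the PR diff) of the staggered-printing pattern the new MPI block uses: each rank sleeps rank seconds before printing and num_ranks - rank - 1 seconds afterwards, so output appears roughly in rank order and all ranks leave print() at about the same time.

#include <mpi.h>
#include <unistd.h>
#include <iostream>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank = 0, num_ranks = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);

  sleep(rank);                 // rank 0 prints first, rank 1 one second later, ...
  std::cout << "rank: " << rank << " prints its local data here\n";
  sleep(num_ranks - rank - 1); // wait out the remaining ranks before continuing

  MPI_Finalize();
  return 0;
}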
4 changes: 2 additions & 2 deletions src/asgard_resources.hpp
@@ -14,7 +14,7 @@
namespace asgard
{
// used to suppress warnings in unused variables
auto const ignore = [](auto ignored) { (void)ignored; };
auto const ignore = [](auto const &ignored) { (void)ignored; };

/*!
* \brief Default precision to use, double if enabled and float otherwise.
@@ -115,4 +115,4 @@ void memcpy_2d(P *dest, int const dest_stride, P const *const source,
#include "asgard_resources_cuda.tpp"
#else
#include "asgard_resources_host.tpp"
#endif
#endif
25 changes: 25 additions & 0 deletions src/asgard_vector.hpp
@@ -1,6 +1,7 @@
#pragma once
#include "build_info.hpp"

#include "asgard_mpi.h"
#include "asgard_resources.hpp"
#include "lib_dispatch.hpp"
#include "tools.hpp"
@@ -11,6 +12,8 @@
#include <string>
#include <vector>

#include <unistd.h>

namespace asgard
{
// resource arguments allow developers to select host (CPU only) or device
@@ -978,6 +981,24 @@ template<typename P, mem_type mem, resource resrc>
template<resource, typename>
void fk::vector<P, mem, resrc>::print(std::string_view const label) const
{
#ifdef ASGARD_USE_MPI
static int const rank = []() {
int my_rank;
auto const status =
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
expect(status == 0);
return my_rank;
}();
static int const num_ranks = []() {
int output;
auto const status = MPI_Comm_size(MPI_COMM_WORLD, &output);
expect(status == 0);
return output;
}();
sleep(rank);
std::cout << "rank: " << rank << '\n';
#endif

if constexpr (mem == mem_type::owner)
std::cout << label << "(owner)" << '\n';
else if constexpr (mem == mem_type::view)
@@ -999,6 +1020,10 @@ void fk::vector<P, mem, resrc>::print(std::string_view const label) const
std::cout << std::right << (*this)(i) << " ";
}
std::cout << '\n';

#ifdef ASGARD_USE_MPI
sleep(num_ranks - rank - 1);
#endif
}

//
69 changes: 69 additions & 0 deletions src/distribution.cpp
@@ -330,6 +330,49 @@ void reduce_results(fk::vector<P, src_mem, resrc> const &source,
#endif
}

template<typename P, std::enable_if_t<std::is_floating_point<P>::value, bool>>
void reduce_results(P const source,
P &dest,
distribution_plan const &plan, int const my_rank)
{
expect(my_rank >= 0);
expect(my_rank < static_cast<int>(plan.size()));

#ifdef ASGARD_USE_MPI
if (plan.size() == 1)
{
dest = source;
return;
}

dest = 0.;
int const num_cols = get_num_subgrid_cols(plan.size());

int const my_row = my_rank / num_cols;
int const my_col = my_rank % num_cols;

MPI_Comm row_communicator;
MPI_Comm const global_communicator = distro_handle.get_global_comm();

auto success =
MPI_Comm_split(global_communicator, my_row, my_col, &row_communicator);
Collaborator review comment:

The MPI_Comm_split() call causes global communication; see the algorithm description here:

https://www.mpich.org/static/docs/v3.1.3/www3/MPI_Comm_split.html

It's OK to use for prototype purposes, but eventually this step should be done only once. You can have a reducer class, where you initialize the object once and keep the communicator, then just call MPI_Allreduce() for every iteration.
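
A minimal sketch of such a reducer, assuming the existing get_num_subgrid_cols() helper; the class name row_reducer and its interface are illustrative, not part of this PR:

#include <mpi.h>
#include <type_traits>

class row_reducer
{
public:
  row_reducer(MPI_Comm const global, int const my_rank, int const num_ranks)
  {
    int const num_cols = get_num_subgrid_cols(num_ranks); // existing asgard helper
    // the collective MPI_Comm_split happens once, at construction
    MPI_Comm_split(global, my_rank / num_cols, my_rank % num_cols, &row_comm_);
  }
  ~row_reducer() { MPI_Comm_free(&row_comm_); }

  // sum one scalar across the subgrid row; called every iteration
  template<typename P>
  P allreduce(P const local) const
  {
    P sum = 0;
    MPI_Datatype const type = std::is_same_v<P, double> ? MPI_DOUBLE : MPI_FLOAT;
    MPI_Allreduce(&local, &sum, 1, type, MPI_SUM, row_comm_);
    return sum;
  }

private:
  MPI_Comm row_comm_;
};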

expect(success == 0);

MPI_Datatype const mpi_type =
std::is_same_v<P, double> ? MPI_DOUBLE : MPI_FLOAT;
success = MPI_Allreduce((void *)&source, (void *)&dest,
1, mpi_type, MPI_SUM, row_communicator);
expect(success == 0);

success = MPI_Comm_free(&row_communicator);
expect(success == 0);

#else
dest = source;
return;
#endif
}

//
// -- below functionality for exchanging solution vector data across subgrid
// rows via point-to-point messages.
@@ -1170,16 +1213,34 @@

#ifdef ASGARD_ENABLE_DOUBLE

template void reduce_results(double const source, double &dest,
distribution_plan const &plan, int const my_rank);

template void reduce_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::owner, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void reduce_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::view, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::owner, resource::host> &dest,
int const segment_size, distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<double, mem_type::view, resource::host> const &source,
fk::vector<double, mem_type::owner, resource::host> &dest,
int const segment_size, distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::view, resource::host> &dest,
int const segment_size, distribution_plan const &plan, int const my_rank);

#ifdef ASGARD_USE_CUDA
template void reduce_results(
fk::vector<double, mem_type::owner, resource::device> const &source,
@@ -1220,11 +1281,19 @@ template void scatter_matrix<double>(double const *A, int const *descA,

#ifdef ASGARD_ENABLE_FLOAT

template void reduce_results(float const source, float &dest,
distribution_plan const &plan, int const my_rank);

template void
reduce_results(fk::vector<float, mem_type::owner, resource::host> const &source,
fk::vector<float, mem_type::owner, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void
reduce_results(fk::vector<float, mem_type::owner, resource::host> const &source,
fk::vector<float, mem_type::view, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<float, mem_type::owner, resource::host> const &source,
fk::vector<float, mem_type::owner, resource::host> &dest,
4 changes: 4 additions & 0 deletions src/distribution.hpp
@@ -238,6 +238,10 @@ template<typename P, mem_type src_mem, mem_type dst_mem, resource resrc>
void reduce_results(fk::vector<P, src_mem, resrc> const &source,
fk::vector<P, dst_mem, resrc> &dest,
distribution_plan const &plan, int const my_rank);
template<typename P, std::enable_if_t<std::is_floating_point<P>::value, bool> = true>
void reduce_results(P const source,
P &dest,
distribution_plan const &plan, int const my_rank);

// generate a message list for each rank for exchange_results function;
// conceptually an internal component function, exposed for testing
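For context, a hedged sketch of how the new scalar overload might back a row-wise inner product in the parallel GMRES iteration; distributed_dot and its arguments are illustrative only, and assume each rank's local segment covers a distinct slice of the vector within its subgrid row:

// Illustrative only: sum local partial dot products across the subgrid row.
template<typename P>
P distributed_dot(fk::vector<P, mem_type::owner, resource::host> const &x,
                  fk::vector<P, mem_type::owner, resource::host> const &y,
                  distribution_plan const &plan, int const my_rank)
{
  P local = 0.;
  for (int i = 0; i < x.size(); ++i)
    local += x(i) * y(i);

  P global = 0.;
  reduce_results(local, global, plan, my_rank); // the new scalar overload
  return global;
}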
77 changes: 77 additions & 0 deletions src/distribution_tests.cpp
@@ -453,6 +453,83 @@ TEMPLATE_TEST_CASE("allreduce across row of subgrids", "[distribution]",
}
}

TEMPLATE_TEST_CASE("allreduce element across row", "[distribution]",
test_precs)
{
if (!is_active())
{
return;
}

SECTION("1 rank")
{
auto const num_ranks = 1;
auto const my_rank = 0;
int const degree = 4;
int const level = 2;

options const o = make_options(
{"-l", std::to_string(level), "-d", std::to_string(degree)});
auto const pde =
make_PDE<default_precision>(PDE_opts::continuity_2, level, degree);
elements::table const table(o, *pde);

auto const plan = get_plan(num_ranks, table);

TestType const gold{42};
TestType const x = gold;
TestType fx = 0.;
reduce_results(x, fx, plan, my_rank);
REQUIRE(fx == gold);
}

SECTION("multiple ranks")
{
#ifdef ASGARD_USE_MPI

int const my_rank = distrib_test_info.get_my_rank();
int const num_ranks = distrib_test_info.get_num_ranks();

if (my_rank < num_ranks)
{
int const degree = 5;
int const level = 4;

options const o = make_options(
{"-l", std::to_string(level), "-d", std::to_string(degree)});

auto const pde =
make_PDE<default_precision>(PDE_opts::continuity_3, level, degree);
elements::table const table(o, *pde);

auto const plan = get_plan(num_ranks, table);

std::vector<TestType> rank_outputs;
for (int i = 0; i < static_cast<int>(plan.size()); ++i)
{
rank_outputs.push_back(i);
}
int const my_row = my_rank / get_num_subgrid_cols(num_ranks);
TestType gold{0};
for (int i = 0; i < static_cast<int>(rank_outputs.size()); ++i)
{
if (i / get_num_subgrid_cols(num_ranks) == my_row)
{
gold = gold + rank_outputs[i];
}
}

auto const &x = rank_outputs[std::min(my_rank, static_cast<int>(plan.size()) - 1)];
TestType fx = 0.;
reduce_results(x, fx, plan, my_rank);
REQUIRE_THAT(fx, Catch::Matchers::WithinAbs(gold, std::numeric_limits<TestType>::epsilon()));
}
#else
REQUIRE(true);
#endif
}
}

void generate_messages_test(int const num_ranks, elements::table const &table)
{
auto const plan = get_plan(num_ranks, table);
2 changes: 1 addition & 1 deletion src/program_options.cpp
@@ -308,7 +308,7 @@ parser::parser(int argc, char const *const *argv)
if ((use_implicit_stepping || use_imex_stepping) && get_num_ranks() > 1)
{
auto const choice = solver_mapping.at(solver_str);
if (choice != solve_opts::scalapack)
if (choice == solve_opts::direct)
{
std::cerr << "Distribution not implemented for implicit stepping\n";
valid = false;