WIP: introduce chunked all-to-all collective communication in order to work around the 2 GB MPI limit; refactor read_projection_datasets in order to avoid haphazard increments of end ranges.
Showing 22 changed files with 548 additions and 266 deletions.
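For context on the limit the commit message refers to: MPI count arguments are plain int, so a single collective call can move at most 2^31 - 1 elements per rank (about 2 GB when the datatype is one byte wide). Below is a minimal sketch of the arithmetic behind the chunked approach; the payload size is made up, and only the chunk size corresponds to the CHUNK_SIZE constant introduced in chunk_info.hh below.

#include <cstddef>
#include <iostream>
#include <limits>

int main()
{
  // Hypothetical per-rank payload: 5 billion elements, which overflows
  // the int count argument of a single MPI collective call.
  const std::size_t total_elements = 5'000'000'000ULL;
  const std::size_t chunk_size     = std::size_t(1) << 30;  // matches CHUNK_SIZE in chunk_info.hh

  std::cout << "int count limit : " << std::numeric_limits<int>::max() << "\n";
  std::cout << "chunks required : "
            << (total_elements + chunk_size - 1) / chunk_size << "\n";  // ceiling division -> 5
  return 0;
}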
chunk_info.hh (new file)
@@ -0,0 +1,69 @@
#ifndef CHUNK_INFO_HH
#define CHUNK_INFO_HH

#include <cstddef>
#include <vector>
#include <set>
#include <algorithm>

using namespace std;

namespace neuroh5
{
  namespace data
  {

    // Constants for chunking: 2^30 elements per chunk, a safety margin
    // below the 2^31-1 limit of MPI's int count arguments
    constexpr size_t CHUNK_SIZE = 1ULL << 30;

    template<typename T>
    struct ChunkInfo {
      std::vector<int> sendcounts;
      std::vector<int> sdispls;
      std::vector<int> recvcounts;
      std::vector<int> rdispls;
      size_t total_send_size;
      size_t total_recv_size;
    };

    // Computes per-rank counts and displacements for the window of
    // chunk_size elements starting at offset chunk_start within each
    // per-rank block described by the full_* arrays.
    template<typename T>
    ChunkInfo<T> calculate_chunk_sizes(
      const std::vector<size_t>& full_sendcounts,
      const std::vector<size_t>& full_sdispls,
      const std::vector<size_t>& full_recvcounts,
      const std::vector<size_t>& full_rdispls,
      size_t chunk_start,
      size_t chunk_size)
    {
      const size_t size = full_sendcounts.size();
      ChunkInfo<T> chunk;
      chunk.sendcounts.resize(size);
      chunk.sdispls.resize(size);
      chunk.recvcounts.resize(size);
      chunk.rdispls.resize(size);

      chunk.total_send_size = 0;
      chunk.total_recv_size = 0;

      for (size_t i = 0; i < size; ++i) {
        // Calculate how much data to send to rank i in this chunk
        size_t send_remaining = (chunk_start < full_sendcounts[i]) ?
          full_sendcounts[i] - chunk_start : 0;
        chunk.sendcounts[i] = static_cast<int>(std::min(send_remaining, chunk_size));
        chunk.sdispls[i] = static_cast<int>(full_sdispls[i] + chunk_start);
        chunk.total_send_size += chunk.sendcounts[i];

        // Calculate how much data to receive from rank i in this chunk
        size_t recv_remaining = (chunk_start < full_recvcounts[i]) ?
          full_recvcounts[i] - chunk_start : 0;
        chunk.recvcounts[i] = static_cast<int>(std::min(recv_remaining, chunk_size));
        chunk.rdispls[i] = static_cast<int>(full_rdispls[i] + chunk_start);
        chunk.total_recv_size += chunk.recvcounts[i];
      }

      return chunk;
    }

  }
}
#endif
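The helper above only computes per-chunk counts and displacements; the all-to-all call site itself is not shown in this excerpt. The following is a rough sketch of how calculate_chunk_sizes might drive a chunked MPI_Alltoallv loop, mirroring the termination logic of the allgatherv template later in this commit. The chunked_alltoallv function is hypothetical, and MPI_SIZE_T and throw_assert are assumed to be provided by the repository's own headers.

// Hypothetical driver; not part of this commit.
#include <mpi.h>
#include <vector>
#include "chunk_info.hh"
#include "neuroh5_types.hh"   // assumed to provide MPI_SIZE_T
#include "throw_assert.hh"

namespace neuroh5
{
  namespace data
  {
    // recvbuf is assumed to be pre-sized to the sum of full_recvcounts.
    template<typename T>
    int chunked_alltoallv (MPI_Comm comm,
                           MPI_Datatype datatype,
                           const std::vector<size_t>& full_sendcounts,
                           const std::vector<size_t>& full_sdispls,
                           const std::vector<size_t>& full_recvcounts,
                           const std::vector<size_t>& full_rdispls,
                           const std::vector<T>& sendbuf,
                           std::vector<T>& recvbuf)
    {
      size_t chunk_start = 0;
      while (true)
        {
          // Stop once no rank has anything left to send.
          size_t local_remaining = 0, global_remaining = 0;
          for (size_t c : full_sendcounts)
            local_remaining += (chunk_start < c) ? (c - chunk_start) : 0;
          throw_assert(MPI_Allreduce(&local_remaining, &global_remaining, 1,
                                     MPI_SIZE_T, MPI_SUM, comm) == MPI_SUCCESS,
                       "chunked_alltoallv: error in MPI_Allreduce");
          if (global_remaining == 0) break;

          // Per-rank counts and displacements for this window of CHUNK_SIZE elements.
          ChunkInfo<T> chunk =
            calculate_chunk_sizes<T>(full_sendcounts, full_sdispls,
                                     full_recvcounts, full_rdispls,
                                     chunk_start, CHUNK_SIZE);

          throw_assert(MPI_Alltoallv(sendbuf.data(), chunk.sendcounts.data(),
                                     chunk.sdispls.data(), datatype,
                                     recvbuf.data(), chunk.recvcounts.data(),
                                     chunk.rdispls.data(), datatype,
                                     comm) == MPI_SUCCESS,
                       "chunked_alltoallv: error in MPI_Alltoallv");

          chunk_start += CHUNK_SIZE;
        }
      return MPI_SUCCESS;
    }
  }
}

Since the counts and displacements passed to MPI_Alltoallv are still int, each per-chunk count stays below 2^31 by construction, which is the point of the CHUNK_SIZE clamp.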
allgatherv_template.hh (new file)
@@ -0,0 +1,148 @@
// -*- mode: c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
//==============================================================================
///  @file allgatherv_template.hh
///
///  Function for sending data via MPI Allgatherv.
///
///  Copyright (C) 2017-2024 Project NeuroH5.
//==============================================================================

#ifndef ALLGATHERV_TEMPLATE_HH
#define ALLGATHERV_TEMPLATE_HH

#include <mpi.h>

#include <vector>
#include <map>

#include "mpi_debug.hh"
#include "throw_assert.hh"
#include "neuroh5_types.hh"
#include "attr_map.hh"
#include "chunk_info.hh"

using namespace std;

namespace neuroh5
{

  namespace mpi
  {

    template<class T>
    int allgatherv_vector (MPI_Comm comm,
                           const MPI_Datatype datatype,
                           size_t sendcount,
                           const vector<T>& sendbuf,
                           vector<size_t>& recvcounts,
                           vector<size_t>& rdispls,
                           vector<T>& recvbuf)
    {
      int ssize; size_t size;
      throw_assert(MPI_Comm_size(comm, &ssize) == MPI_SUCCESS,
                   "allgatherv: unable to obtain size of MPI communicator");
      throw_assert_nomsg(ssize > 0);
      size = ssize;

      /***************************************************************************
       * Send MPI data with Allgatherv
       **************************************************************************/
      recvcounts.resize(size, 0);
      rdispls.resize(size, 0);

      // 1. Each rank sends its data size to every other rank so that
      //    the recvcounts and rdispls arrays can be constructed
      {
        int status;
        MPI_Request request;

        status = MPI_Iallgather(&sendcount, 1, MPI_SIZE_T,
                                &recvcounts[0], 1, MPI_SIZE_T, comm,
                                &request);
        throw_assert(status == MPI_SUCCESS,
                     "allgatherv: error in MPI_Iallgather: status: " << status);
        status = MPI_Wait(&request, MPI_STATUS_IGNORE);
        throw_assert(status == MPI_SUCCESS,
                     "allgatherv: error in MPI_Wait: status: " << status);
      }

      // 2. Each rank accumulates the vector sizes and allocates
      //    a receive buffer, recvcounts, and rdispls
      rdispls[0] = 0;
      size_t recvbuf_size = recvcounts[0];
      for (size_t p = 1; p < size; ++p)
        {
          rdispls[p] = rdispls[p-1] + recvcounts[p-1];
          recvbuf_size += recvcounts[p];
        }

      recvbuf.resize(recvbuf_size, 0);

      {
        int status;
        MPI_Request request;

        // 3. Perform the actual data exchange in chunks so that no single
        //    Allgatherv call exceeds the int count limit
        size_t chunk_start = 0;
        while (true)
          {
            // Determine how much data remains to be sent across all ranks
            size_t global_remaining = 0;
            size_t remaining = 0;
            if (sendcount > chunk_start)
              {
                remaining = sendcount - chunk_start;
              }

            status = MPI_Iallreduce(&remaining,
                                    &global_remaining,
                                    1, MPI_SIZE_T, MPI_SUM,
                                    comm, &request);
            throw_assert(status == MPI_SUCCESS,
                         "allgatherv: error in MPI_Iallreduce: status: " << status);
            status = MPI_Wait(&request, MPI_STATUS_IGNORE);
            throw_assert(status == MPI_SUCCESS,
                         "allgatherv: error in MPI_Wait: status: " << status);

            if (global_remaining == 0)
              break;

            size_t current_chunk = (chunk_start < sendcount) ?
              std::min(data::CHUNK_SIZE, sendcount - chunk_start) : 0;

            // Per-rank counts and displacements for this chunk; counts are
            // clamped to CHUNK_SIZE so that they match what each rank sends
            std::vector<int> chunk_recvcounts(size);
            std::vector<int> chunk_displs(size);

            for (rank_t i = 0; i < size; ++i)
              {
                size_t recv_remaining = (chunk_start < recvcounts[i]) ?
                  (recvcounts[i] - chunk_start) : 0;
                chunk_recvcounts[i] = static_cast<int>(std::min(recv_remaining, data::CHUNK_SIZE));
                chunk_displs[i] = static_cast<int>(rdispls[i] + chunk_start);
              }

            status = MPI_Iallgatherv(sendbuf.data() + chunk_start,
                                     static_cast<int>(current_chunk),
                                     datatype,
                                     recvbuf.data(),
                                     &chunk_recvcounts[0],
                                     &chunk_displs[0],
                                     datatype,
                                     comm, &request);
            throw_assert(status == MPI_SUCCESS,
                         "allgatherv: error in MPI_Iallgatherv: status: " << status);
            status = MPI_Wait(&request, MPI_STATUS_IGNORE);
            throw_assert(status == MPI_SUCCESS,
                         "allgatherv: error in MPI_Wait: status: " << status);
            chunk_start = chunk_start + current_chunk;
          }
      }

      return MPI_SUCCESS;
    }
  }
}

#endif
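As a usage sketch (not part of this commit), the following gathers a variable-length vector of 32-bit values from every rank onto all ranks. The payload is made up, and the include path for allgatherv_template.hh is assumed to be on the compiler's search path.

#include <mpi.h>
#include <vector>
#include <cstdint>
#include <cstdio>
#include "allgatherv_template.hh"

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank = 0, nranks = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  // Each rank contributes a different number of 32-bit values.
  std::vector<uint32_t> local(100 + 10 * rank);
  for (size_t i = 0; i < local.size(); ++i)
    local[i] = static_cast<uint32_t>(rank * 1000 + i);

  std::vector<size_t> recvcounts, rdispls;
  std::vector<uint32_t> gathered;
  neuroh5::mpi::allgatherv_vector<uint32_t>(MPI_COMM_WORLD, MPI_UINT32_T,
                                            local.size(), local,
                                            recvcounts, rdispls, gathered);

  if (rank == 0)
    printf("gathered %zu values from %d ranks\n", gathered.size(), nranks);

  MPI_Finalize();
  return 0;
}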