Include MPI in GEMM code
khsa1 committed Jul 29, 2020
1 parent e93a484 commit 7c547c1
Showing 1 changed file with 145 additions and 78 deletions.
223 changes: 145 additions & 78 deletions Tacho_ExampleDenseByBlocks.cpp
@@ -18,6 +18,7 @@
#ifdef TACHO_HAVE_MKL
#include "mkl_service.h"
#endif
#include <mpi.h>

using namespace Tacho;

@@ -32,21 +33,54 @@ int main (int argc, char *argv[]) {
bool serial = false;
int nthreads = 1;
bool verbose = true;
int mbeg = 1000;
int mend = 6000;
int step = 1000;
int mb = 128;
int rank, np;
ordinal_type m = 1000;
ordinal_type l_m = 1000;
const double alpha = 1.0, beta = 1.0;
bool local = false;
MPI_Status statuses[4];
MPI_Request send_requests[2], recv_requests[2], requests[4];
int ierr;

opts.set_option<bool>("serial", "Flag for invoking serial algorithm", &serial);
opts.set_option<int>("kokkos-threads", "Number of threads", &nthreads);
opts.set_option<bool>("verbose", "Flag for verbose printing", &verbose);
opts.set_option<int>("begin", "Test problem begin size", &mbeg);
opts.set_option<int>("end", "Test problem end size", &mend);
opts.set_option<int>("step", "Test problem step size", &step);
opts.set_option<int>("mb", "Blocksize", &mb);
opts.set_option<ordinal_type>("local-size", "Test problem local size", &l_m);
opts.set_option<ordinal_type>("size", "Test problem size", &m);
opts.set_option<int>("mb", "Blocksize", &mb);
opts.set_option<bool>("local", "Set local size", &local);

const bool r_parse = opts.parse(argc, argv);
if (r_parse) return 0; // print help return
int r_val = 0;

MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
printf("Process %d of %d\n", rank, np);
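// The matrices are decomposed over a square p x p process grid, so the number of ranks must be a perfect square.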
if(sqrt(np) != int(sqrt(np))) {printf("Number of ranks must be a perfect square\n"); return 0; }
int p = sqrt(np);

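// Each rank owns one l_m x l_m tile of the global m x m matrices; the user gives either the local tile size or the global size.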
if(local) {
m = l_m * p;
}
else {
l_m = m / p;
ordinal_type rem = m % p;
if(rem != 0) {if(rank==0) printf("Matrix size must be divisible by the process grid dimension\n"); return 0; }
}
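// Build a periodic 2D Cartesian communicator (a torus) so tiles wrap around when shifted between neighbours.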
int periods[]={1,1}; // both vertical and horizontal movement wrap around
int dims[]={p,p};
int coords[2]; /* 2 Dimension topology so 2 coordinates */
int right=0, left=0, down=0, up=0; // neighbor ranks
MPI_Comm cart_comm;
MPI_Cart_create(MPI_COMM_WORLD,2,dims,periods,1,&cart_comm );

MPI_Comm_rank(cart_comm,&rank);
MPI_Cart_coords(cart_comm,rank,2,coords);
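// Neighbours for the initial skew: the shift distance in each dimension is this rank's coordinate
// in the other dimension (Cannon-style alignment of A and B).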
MPI_Cart_shift(cart_comm, 1, coords[0], &left,&right);
MPI_Cart_shift(cart_comm, 0, coords[1], &up,&down);

Kokkos::initialize(argc, argv);

@@ -59,7 +93,6 @@ int main (int argc, char *argv[]) {
printExecSpaceConfiguration<host_exec_space>("Default HostSpace");
printExecSpaceConfiguration< exec_space>("Default DeviceSpace");

int r_val = 0;
const double eps = std::numeric_limits<double>::epsilon()*10000;
{
typedef DenseMatrixView<value_type,exec_space> DenseMatrixViewType;
@@ -73,38 +106,43 @@ int main (int argc, char *argv[]) {
typedef Kokkos::TaskScheduler<exec_space> sched_type;
sched_type sched;

typedef TaskFunctor_Chol<sched_type,DenseMatrixOfBlocksType,
Uplo::Upper,Algo::ByBlocks> task_functor_chol;
typedef TaskFunctor_Trsm<sched_type,double,DenseMatrixOfBlocksType,
Side::Left,Uplo::Upper,Trans::ConjTranspose,Diag::NonUnit,Algo::ByBlocks> task_functor_trsm;
typedef TaskFunctor_Gemm<sched_type,double,DenseMatrixOfBlocksType,
Trans::NoTranspose,Trans::NoTranspose,Algo::ByBlocks> task_functor_gemm;
typedef TaskFunctor_Herk<sched_type,double,DenseMatrixOfBlocksType,
Uplo::Upper,Trans::ConjTranspose,Algo::ByBlocks> task_functor_herk;

const ordinal_type max_functor_size = 4*sizeof(task_functor_gemm);

Kokkos::DualView<value_type*,exec_space>
a("a", mend*mend), a1("a1", mend*mend), a2("a2", mend*mend),
b("b", mend*mend);
l_a("l_a", l_m*l_m),
l_c("l_c", l_m*l_m),
l_b("l_b", l_m*l_m);

Kokkos::DualView<value_type*,exec_space>
l_a2("l_a2", l_m*l_m),
l_b2("l_b2", l_m*l_m);

const ordinal_type bmend = (mend/mb) + 1;
const ordinal_type l_bmend = (l_m/mb) + 1;
Kokkos::DualView<DenseMatrixViewType*,exec_space>
ha("ha", l_bmend*l_bmend), hb("hb", l_bmend*l_bmend), hc("hc", l_bmend*l_bmend);
Kokkos::DualView<DenseMatrixViewType*,exec_space>
ha("ha", bmend*bmend), hb("hb", bmend*bmend), hc("hc", bmend*bmend);
ha2("ha2", l_bmend*l_bmend), hb2("hb2", l_bmend*l_bmend);

{
const ordinal_type
task_queue_capacity_tmp = 2*bmend*bmend*bmend*max_functor_size,
task_queue_capacity_tmp = 2*l_bmend*l_bmend*l_bmend*max_functor_size,
min_block_size = 16,
max_block_size = 4*max_functor_size,
num_superblock = 4,
superblock_size = std::max(task_queue_capacity_tmp/num_superblock,max_block_size),
task_queue_capacity = std::max(task_queue_capacity_tmp,superblock_size*num_superblock);

if(rank==0) {
std::cout << "capacity = " << task_queue_capacity << "\n";
std::cout << "min_block_size = " << min_block_size << "\n";
std::cout << "max_block_size = " << max_block_size << "\n";
std::cout << "superblock_size = " << superblock_size << "\n";
std::cout << "local_array_size = " << l_m*l_m << "\n";
std::cout << "array_size = " << m*m << "\n";
}

sched = sched_type(typename sched_type::memory_space(),
(size_t)task_queue_capacity,
@@ -113,7 +151,7 @@ int main (int argc, char *argv[]) {
(unsigned)superblock_size);
}

const ordinal_type dry = 0, niter = 1;
const ordinal_type dry = -2, niter = 3;

double t_reference = 0, t_byblocks = 0;

@@ -122,123 +160,152 @@ int main (int argc, char *argv[]) {
const ordinal_type m = mat.extent(0), n = mat.extent(1);
for (ordinal_type j=0;j<n;++j)
for (ordinal_type i=0;i<m;++i)
mat(i,j) = random.value();
mat(i,j) = random.value()*(rank+1);
};
auto zeroize= [&](const DenseMatrixViewHostType &mat) {
const ordinal_type m = mat.extent(0), n = mat.extent(1);
for (ordinal_type j=0;j<n;++j)
for (ordinal_type i=0;i<m;++i)
mat(i,j) = 0;
};

///
/// Gemm
///
for (ordinal_type m=mbeg;m<=mend;m+=step) {
double t1, t2;
t1 = MPI_Wtime();
//for (ordinal_type m=mbeg;m<=mend;m+=step) {
t_reference = 0; t_byblocks = 0;
auto sub_a = Kokkos::subview(a, range_type(0,m*m));
auto sub_b = Kokkos::subview(b, range_type(0,m*m));
auto sub_a1 = Kokkos::subview(a1, range_type(0,m*m));
auto sub_a2 = Kokkos::subview(a2, range_type(0,m*m));

{
sub_a. modify<host_exec_space>();
sub_b. modify<host_exec_space>();
sub_a1.modify<host_exec_space>();

DenseMatrixViewHostType A, B, C;
A.set_view(m, m);
A.attach_buffer(1, m, sub_a.h_view.data());

B.set_view(m, m);
B.attach_buffer(1, m, sub_b.h_view.data());

C.set_view(m, m);
C.attach_buffer(1, m, sub_a1.h_view.data());
auto l_sub_a = Kokkos::subview(l_a, range_type(0,l_m*l_m));
auto l_sub_b = Kokkos::subview(l_b, range_type(0,l_m*l_m));
auto l_sub_c = Kokkos::subview(l_c, range_type(0,l_m*l_m));

randomize(A);
randomize(B);
randomize(C);
auto l_sub_a2 = Kokkos::subview(l_a2, range_type(0,l_m*l_m));
auto l_sub_b2 = Kokkos::subview(l_b2, range_type(0,l_m*l_m));

sub_a2.modify<exec_space>();
Kokkos::deep_copy(sub_a2.d_view, sub_a1.h_view);
}

// dense by blocks
{
sub_a. sync <exec_space>();
sub_b. sync <exec_space>();
sub_a2.sync <exec_space>();
sub_a2.modify<exec_space>();
l_sub_b. modify<host_exec_space>();
l_sub_b2. modify<host_exec_space>();

DenseMatrixViewHostType l_A, l_B, l_C;
DenseMatrixViewHostType l_A2, l_B2;
l_A.set_view(l_m, l_m);
l_A.attach_buffer(1, l_m, l_sub_a.h_view.data());
l_A2.set_view(l_m, l_m);
l_A2.attach_buffer(1, l_m, l_sub_a2.h_view.data());

DenseMatrixViewType A, B, C;
A.set_view(m, m);
A.attach_buffer(1, m, sub_a.d_view.data());
l_B.set_view(l_m, l_m);
l_B.attach_buffer(1, l_m, l_sub_b.h_view.data());
l_B2.set_view(l_m, l_m);
l_B2.attach_buffer(1, l_m, l_sub_b2.h_view.data());

B.set_view(m, m);
B.attach_buffer(1, m, sub_b.d_view.data());
l_C.set_view(l_m, l_m);
l_C.attach_buffer(1, l_m, l_sub_c.h_view.data());

C.set_view(m, m);
C.attach_buffer(1, m, sub_a2.d_view.data());
randomize(l_A);
randomize(l_B);
zeroize(l_C);

const ordinal_type bm = (m/mb) + (m%mb>0);

const ordinal_type bm = (l_m/mb) + (l_m%mb>0);

ha.modify<host_exec_space>();
hb.modify<host_exec_space>();
hc.modify<host_exec_space>();

DenseMatrixOfBlocksHostType HA, HB, HC;
DenseMatrixOfBlocksHostType HA2, HB2;

HA.set_view(bm, bm);
HA.attach_buffer(1, bm, ha.h_view.data());
HA2.set_view(bm, bm);
HA2.attach_buffer(1, bm, ha2.h_view.data());

HB.set_view(bm, bm);
HB.attach_buffer(1, bm, hb.h_view.data());
HB2.set_view(bm, bm);
HB2.attach_buffer(1, bm, hb2.h_view.data());

HC.set_view(bm, bm);
HC.attach_buffer(1, bm, hc.h_view.data());

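// Partition each local tile into mb x mb blocks and point the block views at the tile buffers.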
setMatrixOfBlocks(HA, m, m, mb);
attachBaseBuffer(HA, A.data(), A.stride_0(), A.stride_1());
setMatrixOfBlocks(HA, l_m, l_m, mb);
attachBaseBuffer(HA, l_A.data(), l_A.stride_0(), l_A.stride_1());
setMatrixOfBlocks(HA2, l_m, l_m, mb);
attachBaseBuffer(HA2, l_A2.data(), l_A2.stride_0(), l_A2.stride_1());

setMatrixOfBlocks(HB, m, m, mb);
attachBaseBuffer(HB, B.data(), B.stride_0(), B.stride_1());
setMatrixOfBlocks(HB, l_m, l_m, mb);
attachBaseBuffer(HB, l_B.data(), l_B.stride_0(), l_B.stride_1());
setMatrixOfBlocks(HB2, l_m, l_m, mb);
attachBaseBuffer(HB2, l_B2.data(), l_B2.stride_0(), l_B2.stride_1());

setMatrixOfBlocks(HC, m, m, mb);
attachBaseBuffer(HC, C.data(), C.stride_0(), C.stride_1());
setMatrixOfBlocks(HC, l_m, l_m, mb);
attachBaseBuffer(HC, l_C.data(), l_C.stride_0(), l_C.stride_1());

ha.sync<exec_space>();
hb.sync<exec_space>();
hc.sync<exec_space>();
ha2.sync<exec_space>();
hb2.sync<exec_space>();

DenseMatrixOfBlocksType DA, DB, DC;
DenseMatrixOfBlocksType DA2, DB2;

DA.set_view(bm, bm);
DA.attach_buffer(1, bm, ha.d_view.data());
DA2.set_view(bm, bm);
DA2.attach_buffer(1, bm, ha2.d_view.data());

DB.set_view(bm, bm);
DB.attach_buffer(1, bm, hb.d_view.data());
DB2.set_view(bm, bm);
DB2.attach_buffer(1, bm, hb2.d_view.data());

DC.set_view(bm, bm);
DC.attach_buffer(1, bm, hc.d_view.data());

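// Initial alignment: shift this rank's A tile left by its row index and its B tile up by its column index.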
MPI_Sendrecv_replace(&(l_A(0,0)),l_m*l_m,MPI_DOUBLE,left,11,right,11,cart_comm,MPI_STATUS_IGNORE);
MPI_Sendrecv_replace(&(l_B(0,0)),l_m*l_m,MPI_DOUBLE,up,11,down,11,cart_comm,MPI_STATUS_IGNORE);
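// Multiply the aligned local blocks once before the shift loop begins.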
{
const double alpha = -1.0, beta = 1.0;
double loop1_start = MPI_Wtime();
timer.reset();
Kokkos::host_spawn(Kokkos::TaskSingle(sched, Kokkos::TaskPriority::High),
task_functor_gemm(sched, alpha, DA, DB, beta, DC));
Kokkos::wait(sched);
double loop1_end = MPI_Wtime();
}
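// From here on, every step shifts A one position left and B one position up.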
MPI_Cart_shift(cart_comm, 1, 1, &left,&right);
MPI_Cart_shift(cart_comm, 0, 1, &up,&down);
MPI_Sendrecv_replace(&(l_A(0,0)),l_m*l_m,MPI_DOUBLE,left,11,right,11,cart_comm,MPI_STATUS_IGNORE);
MPI_Sendrecv_replace(&(l_B(0,0)),l_m*l_m,MPI_DOUBLE,up,11,down,11,cart_comm,MPI_STATUS_IGNORE);
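// Remaining p-1 steps: post non-blocking shifts of A and B into the spare buffers,
// overlap them with the local by-blocks GEMM, then copy the received tiles back.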
for(int i=1; i<p; i++) {
MPI_Isend(&(l_A(0,0)), l_m*l_m, MPI_DOUBLE, left, 0, cart_comm, &requests[0]);
MPI_Irecv(&(l_A2(0,0)), l_m*l_m, MPI_DOUBLE, right, MPI_ANY_TAG, cart_comm, &requests[1]);
MPI_Isend(&(l_B(0,0)), l_m*l_m, MPI_DOUBLE, up, 0, cart_comm, &requests[2]);
MPI_Irecv(&(l_B2(0,0)), l_m*l_m, MPI_DOUBLE, down, MPI_ANY_TAG, cart_comm, &requests[3]);
double loop_start = MPI_Wtime();
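// Time the by-blocks GEMM over niter iterations; dry-run iterations (iter < 0) are not counted.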
for (ordinal_type iter=dry;iter<niter;++iter) {
timer.reset();
Kokkos::host_spawn(Kokkos::TaskSingle(sched, Kokkos::TaskPriority::High),
task_functor_gemm(sched, alpha, DA, DB, beta, DC));
Kokkos::wait(sched);
t_byblocks += (iter >=0)*timer.seconds();
}
t_byblocks /= niter;
//clearFutureOfBlocks(HC);
double loop_end = MPI_Wtime();
MPI_Waitall(4, requests, statuses);
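// Copy the freshly received tiles into the working A and B buffers for the next step.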
for(int ii=0; ii<l_m; ii++) {
for(int jj=0; jj<l_m; jj++) {
l_A(ii,jj)=l_A2(ii,jj);
l_B(ii,jj)=l_B2(ii,jj);
}
}
}
}

{
a1.sync<host_exec_space>();
a2.sync<host_exec_space>();

}

}
t2 = MPI_Wtime();
if(rank==0) printf("Time spent in GEMM code: %f\n", t2 - t1);
}
Kokkos::finalize();
MPI_Finalize();

return r_val;
}
