Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
Contains the following applications:
- MPI+Kokkos tasks GEMM
- Locking MPI+Kokkos tasks graph500
- Non-locking MPI+Kokkos tasks graph500
  • Loading branch information
khsa1 committed Jul 29, 2020
0 parents commit d1ecbe6
Show file tree
Hide file tree
Showing 53 changed files with 12,987 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
*.o
*.a
*.tmp
KokkosCore_config.h
Tacho_ExampleDenseByBlocks
graph500_locking
graph500_nonlocking
37 changes: 37 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Toolchain: MPI C++ compiler wrapper. The flags below enable OpenMP and link
# MKL using Intel-compiler spellings.
# NOTE(review): assumes mpic++ wraps the Intel compiler (the README configures
# Trilinos with icc/icpc) -- confirm before using another compiler.
CXX=mpic++
CXXFLAGS=-qopenmp -mkl

# Site-specific locations; edit these for your system (see README.md).
# TRILINOS_OPT is the tree holding the compiled Trilinos libraries and generated
# headers, TRILINOS_SRC is the Trilinos source checkout.
TRILINOS_OPT=/fs/project/PZS0530/skhuvis/src/trilinos-build
TRILINOS_SRC=/fs/project/PZS0530/skhuvis/src/Trilinos
# Paths derived from the two roots above.
KOKKOS_PATH=${TRILINOS_SRC}/packages/kokkos
SHYLU_SRC=${TRILINOS_SRC}/packages/shylu/shylu_node/tacho/src
SHYLU_OPT=${TRILINOS_OPT}/packages/shylu/shylu_node/tacho/src
KOKKOS_OPT=${TRILINOS_OPT}/packages/kokkos
# Inputs consumed by Makefile.kokkos (included below): backend and target
# architecture. NOTE(review): "BDW" is presumably Kokkos' keyword for Intel
# Broadwell -- confirm against the Kokkos version in use.
KOKKOS_DEVICES="OpenMP"
KOKKOS_ARCH="BDW"

# Provides the KOKKOS_CXXFLAGS, KOKKOS_LDFLAGS, KOKKOS_LIBS and
# KOKKOS_LINK_DEPENDS variables referenced by the rules in this Makefile.
include ${KOKKOS_PATH}/Makefile.kokkos

# Tacho headers: generated ones from the build tree, the rest from the sources.
INCS=-I${SHYLU_OPT} -I${SHYLU_SRC}

# Library search paths and libraries for Kokkos, Tacho, SuiteSparse, gtest and
# METIS. This value is also forwarded to the graph500 sub-make as KLDFLAGS.
# NOTE(review): -L${KOKKOS_OPT}/core/src is listed twice; harmless but redundant.
LDFLAGS=-lmetis $(KOKKOS_LDFLAGS) -L${KOKKOS_OPT}/core/src -L${KOKKOS_OPT}/containers/src -L${KOKKOS_OPT}/core/src -L${TRILINOS_OPT}/packages/common/auxiliarySoftware/SuiteSparse/src -L${SHYLU_OPT} -L${KOKKOS_OPT}/algorithms/src -L${TRILINOS_OPT}/commonTools/gtest -L./ -lkokkoscore -lkokkoscontainers -lshylu_nodetacho -ltrilinosss -lkokkosalgorithms -lgtest


# Everything built by `make all`. The two graph500 programs are built by the
# Makefile in graph500/kokkos and end up in that directory.
EXE=Tacho_ExampleDenseByBlocks graph500_nonlocking graph500_locking

# These targets never create a file of the same name in this directory, so
# declare them phony: a stray file called e.g. "clean" or "graph500_locking"
# must not stop the rule from running.
.PHONY: all clean graph500_nonlocking graph500_locking

all: $(EXE)

# Dense-by-blocks GEMM example, linked against Kokkos and Tacho.
Tacho_ExampleDenseByBlocks: Tacho_ExampleDenseByBlocks.o $(KOKKOS_LINK_DEPENDS)
	$(CXX) $(CXXFLAGS) -o Tacho_ExampleDenseByBlocks Tacho_ExampleDenseByBlocks.o $(LDFLAGS) $(KOKKOS_LIBS)

# Delegate to the graph500 Makefile. $(MAKE) (rather than a literal `make`)
# keeps the same make program, shares the -j jobserver and honours -n/-t/-q.
graph500_nonlocking:
	$(MAKE) -C graph500/kokkos graph500_nonlocking INCS="${INCS}" KOKKOS_PATH="${KOKKOS_PATH}" KOKKOS_DEVICES="$(KOKKOS_DEVICES)" KLDFLAGS="${LDFLAGS}"
graph500_locking:
	$(MAKE) -C graph500/kokkos graph500_locking INCS="${INCS}" KOKKOS_PATH="${KOKKOS_PATH}" KOKKOS_DEVICES="$(KOKKOS_DEVICES)" KLDFLAGS="${LDFLAGS}"

# Compile a C++ source in this directory with the Tacho and Kokkos flags.
%.o : %.cpp
	$(CXX) -c $(CXXFLAGS) $(INCS) $(KOKKOS_CXXFLAGS) $< -o $@

# Remove local build products, then clean the graph500 tree. The leading '-'
# lets `make clean` succeed even if the sub-make fails.
clean:
	rm -f *.o *.a *.tmp $(EXE)
	-$(MAKE) -C graph500/kokkos clean KOKKOS_PATH="${KOKKOS_PATH}"
56 changes: 56 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# README

## Getting Trilinos

```
git clone https://github.com/trilinos/Trilinos.git
cd Trilinos
git checkout b783a65
```

## Building Kokkos

Run the following cmake command to configure Trilinos (with Kokkos and ShyLU/Tacho enabled), then build it with `make`:
```
cmake -D CMAKE_INSTALL_PREFIX=$INSTALL_DIR -D \
TPL_ENABLE_MPI:BOOL=OFF -D Trilinos_ENABLE_Fortran:BOOL=OFF -D TPL_ENABLE_Pthread:BOOL=OFF \
-D Kokkos_ENABLE_Pthread:BOOL=OFF -D Trilinos_ENABLE_OpenMP:BOOL=ON \
-D Kokkos_ENABLE_OpenMP:BOOL=ON -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF \
-D Trilinos_ENABLE_EXAMPLES:BOOL=ON -D Trilinos_ENABLE_TESTS:BOOL=ON \
-D Trilinos_ENABLE_KokkosCore:BOOL=ON -D Trilinos_ENABLE_KokkosContainers:BOOL=ON \
-D Trilinos_ENABLE_KokkosExample:BOOL=OFF -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON \
-D Trilinos_ENABLE_CXX11:BOOL=ON -D Kokkos_ENABLE_CXX11:BOOL=ON \
-D Kokkos_ENABLE_Serial:BOOL=ON -D Trilinos_ENABLE_ShyLU:BOOL=OFF \
-D Trilinos_ENABLE_ShyLU_DDCore:BOOL=OFF -D Trilinos_ENABLE_ShyLU_NodeTacho:BOOL=ON \
-D Trilinos_ENABLE_Teuchos:BOOL=ON -D Teuchos_ENABLE_TESTS:BOOL=OFF \
-D TPL_ENABLE_Cholmod:BOOL=OFF -D TPL_ENABLE_METIS:BOOL=ON \
-D METIS_INCLUDE_DIRS:FILEPATH=/apps/metis/intel/18.0/5.1.0/include \
-D METIS_LIBRARY_DIRS:FILEPATH=/apps/metis/intel/18.0/5.1.0/lib \
-D CMAKE_BUILD_TYPE:STRING=RELEASE -D CMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
-D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc \
-D CMAKE_CXX_FLAGS:STRING=-DKOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION \
-D CMAKE_EXE_LINKER_FLAGS:STRING="-lnuma -lrt -ldl -lgfortran" \
-D TPL_ENABLE_MKL:BOOL=ON -D TPL_MKL_LIBRARIES:FILEPATH=-mkl \
-D TPL_ENABLE_BLAS:BOOL=ON -D TPL_BLAS_LIBRARIES:FILEPATH=-mkl \
-D TPL_ENABLE_LAPACK:BOOL=ON -D TPL_LAPACK_LIBRARIES:FILEPATH=-mkl \
-D KOKKOS_ARCH=BDW $SRC_DIR
```

`$SRC_DIR` should be replaced with the location of your Trilinos checkout (which contains Kokkos) and `$INSTALL_DIR` should be replaced with the installation directory.

## Building MPI+Kokkos codes

The included Makefile will build three applications:

1. MPI+Kokkos GEMM code
2. MPI+Kokkos locking Graph500
3. MPI+Kokkos non-locking Graph500

Make sure that you have already built Trilinos. The following changes will need to be made to the Makefile to build correctly:

1. Replace `$TRILINOS_OPT` with `$INSTALL_DIR`
2. Replace `$TRILINOS_SRC` with `$SRC_DIR`

Then, run `make all` to generate the following executables:
1. `Tacho_ExampleDenseByBlocks`
2. `graph500/kokkos/graph500_locking`
3. `graph500/kokkos/graph500_nonlocking`
244 changes: 244 additions & 0 deletions Tacho_ExampleDenseByBlocks.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
#include "ShyLU_NodeTacho_config.h"

#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <impl/Kokkos_Timer.hpp>

#include "Tacho_Util.hpp"
#include "Tacho_DenseMatrixView.hpp"
#include "Tacho_DenseFlopCount.hpp"

#include "Tacho_Chol_ByBlocks.hpp"
#include "Tacho_Gemm_ByBlocks.hpp"
#include "Tacho_Herk_ByBlocks.hpp"
#include "Tacho_Trsm_ByBlocks.hpp"

#include "Tacho_CommandLineParser.hpp"

#ifdef TACHO_HAVE_MKL
#include "mkl_service.h"
#endif

using namespace Tacho;

// Prints a short timing summary: the ratio t_reference / t_byblocks, labelled
// as a speedup. Relies on variables named t_reference and t_byblocks being
// visible at the point of expansion.
// NOTE(review): expands to three separate statements (not wrapped in
// do { } while (0)), so it must not be used as the body of an unbraced
// if/else/loop. It is not referenced anywhere in this file.
#define PRINT_TIMER \
printf(" Time \n"); \
printf(" byblocks/reference (speedup): %10.6f\n", t_reference/t_byblocks); \
printf("\n");

/// Benchmark driver for Tacho's task-based dense-by-blocks GEMM.
///
/// For every problem size m = begin, begin+step, ... <= end the program fills
/// three m x m matrices with random values on the host, partitions them into
/// mb x mb blocks, spawns one TaskFunctor_Gemm on a Kokkos::TaskScheduler and
/// accumulates the wall-clock time of that task graph in t_byblocks.
///
/// Command-line options: --serial, --kokkos-threads, --verbose, --begin,
/// --end, --step, --mb (see the set_option calls below).
///
/// @return 0 on success or after printing help; 1 if --mb, --step or --begin
///         hold values the benchmark cannot run with.
int main (int argc, char *argv[]) {
  CommandLineParser opts("This example program measure the performance of dense-by-blocks on Kokkos::OpenMP");

  // NOTE(review): serial, nthreads and verbose are parsed but never read
  // afterwards in this file.
  bool serial = false;
  int nthreads = 1;
  bool verbose = true;
  int mbeg = 1000;
  int mend = 6000;
  int step = 1000;
  int mb = 128;

  opts.set_option<bool>("serial", "Flag for invoking serial algorithm", &serial);
  opts.set_option<int>("kokkos-threads", "Number of threads", &nthreads);
  opts.set_option<bool>("verbose", "Flag for verbose printing", &verbose);
  opts.set_option<int>("begin", "Test problem begin size", &mbeg);
  opts.set_option<int>("end", "Test problem end size", &mend);
  opts.set_option<int>("step", "Test problem step size", &step);
  opts.set_option<int>("mb", "Blocksize", &mb);

  const bool r_parse = opts.parse(argc, argv);
  if (r_parse) return 0; // print help return

  // Reject values that would otherwise break the run further down:
  //   mb   <= 0 : integer division by zero / negative block counts (m/mb, m%mb)
  //   step <= 0 : the size sweep never terminates or walks into negative sizes
  //   begin < 0 : negative matrix extents
  // Done before Kokkos::initialize so that no finalize is needed on this path.
  if (mb <= 0 || step <= 0 || mbeg < 0) {
    std::cerr << "Invalid options: --mb and --step must be positive and --begin must be non-negative"
              << " (mb = " << mb << ", step = " << step << ", begin = " << mbeg << ")\n";
    return 1;
  }

  Kokkos::initialize(argc, argv);

  typedef double value_type;
  typedef Kokkos::pair<ordinal_type,ordinal_type> range_type;
  typedef Kokkos::DefaultExecutionSpace exec_space;
  //typedef Kokkos::DefaultHostExecutionSpace exec_space;
  typedef Kokkos::DefaultHostExecutionSpace host_exec_space;

  printExecSpaceConfiguration<host_exec_space>("Default HostSpace");
  printExecSpaceConfiguration<     exec_space>("Default DeviceSpace");

  int r_val = 0;
  // Everything that owns Kokkos memory lives inside this scope so that it is
  // destroyed before Kokkos::finalize() runs.
  {
    // "Device" types: a matrix of scalars and a matrix whose entries are
    // themselves matrix views (the blocks).
    typedef DenseMatrixView<value_type,exec_space> DenseMatrixViewType;
    typedef DenseMatrixView<DenseMatrixViewType,exec_space> DenseMatrixOfBlocksType;

    // Host-side counterparts used to fill in data and block descriptors.
    typedef DenseMatrixView<value_type,host_exec_space> DenseMatrixViewHostType;
    typedef DenseMatrixView<DenseMatrixViewType,host_exec_space> DenseMatrixOfBlocksHostType;

    Kokkos::Impl::Timer timer;

    typedef Kokkos::TaskScheduler<exec_space> sched_type;
    sched_type sched;

    // Only task_functor_gemm is used below; it also determines the task size
    // the scheduler's memory pool is dimensioned for.
    typedef TaskFunctor_Chol<sched_type,DenseMatrixOfBlocksType,
                             Uplo::Upper,Algo::ByBlocks> task_functor_chol;
    typedef TaskFunctor_Trsm<sched_type,double,DenseMatrixOfBlocksType,
                             Side::Left,Uplo::Upper,Trans::ConjTranspose,Diag::NonUnit,Algo::ByBlocks> task_functor_trsm;
    typedef TaskFunctor_Gemm<sched_type,double,DenseMatrixOfBlocksType,
                             Trans::NoTranspose,Trans::NoTranspose,Algo::ByBlocks> task_functor_gemm;
    typedef TaskFunctor_Herk<sched_type,double,DenseMatrixOfBlocksType,
                             Uplo::Upper,Trans::ConjTranspose,Algo::ByBlocks> task_functor_herk;

    const ordinal_type max_functor_size = 4*sizeof(task_functor_gemm);

    // Scalar storage, allocated once for the largest problem (mend x mend) and
    // reused through subviews for every smaller size.
    Kokkos::DualView<value_type*,exec_space>
      a("a", mend*mend), a1("a1", mend*mend), a2("a2", mend*mend),
      b("b", mend*mend);

    // Block-descriptor storage for the largest problem: bmend x bmend blocks.
    const ordinal_type bmend = (mend/mb) + 1;
    Kokkos::DualView<DenseMatrixViewType*,exec_space>
      ha("ha", bmend*bmend), hb("hb", bmend*bmend), hc("hc", bmend*bmend);

    {
      // Dimension the scheduler's memory pool from the block count of the
      // largest problem and the functor size.
      const ordinal_type
        task_queue_capacity_tmp = 2*bmend*bmend*bmend*max_functor_size,
        min_block_size  = 16,
        max_block_size  = 4*max_functor_size,
        num_superblock  = 4,
        superblock_size = std::max(task_queue_capacity_tmp/num_superblock,max_block_size),
        task_queue_capacity = std::max(task_queue_capacity_tmp,superblock_size*num_superblock);

      std::cout << "capacity = " << task_queue_capacity << "\n";
      std::cout << "min_block_size = " << min_block_size << "\n";
      std::cout << "max_block_size = " << max_block_size << "\n";
      std::cout << "superblock_size = " << superblock_size << "\n";

      sched = sched_type(typename sched_type::memory_space(),
                         (size_t)task_queue_capacity,
                         (unsigned)min_block_size,
                         (unsigned)max_block_size,
                         (unsigned)superblock_size);
    }

    // Timed iterations run for iter in [dry, niter); a negative `dry` would add
    // warm-up iterations that are excluded from the accumulated time below.
    const ordinal_type dry = 0, niter = 1;

    // NOTE(review): t_reference is only ever reset to 0 and t_byblocks is never
    // printed in this file; no reference GEMM is run.
    double t_reference = 0, t_byblocks = 0;

    // Fills a host matrix with random values, column by column.
    Random<value_type> random;
    auto randomize = [&](const DenseMatrixViewHostType &mat) {
      const ordinal_type m = mat.extent(0), n = mat.extent(1);
      for (ordinal_type j=0;j<n;++j)
        for (ordinal_type i=0;i<m;++i)
          mat(i,j) = random.value();
    };

    ///
    /// Gemm
    ///
    for (ordinal_type m=mbeg;m<=mend;m+=step) {
      t_reference = 0; t_byblocks = 0;
      // The first m*m entries of each buffer hold the current m x m problem.
      auto sub_a  = Kokkos::subview(a,  range_type(0,m*m));
      auto sub_b  = Kokkos::subview(b,  range_type(0,m*m));
      auto sub_a1 = Kokkos::subview(a1, range_type(0,m*m));
      auto sub_a2 = Kokkos::subview(a2, range_type(0,m*m));

      // Generate the inputs on the host: A in a, B in b, C in a1.
      {
        sub_a. modify<host_exec_space>();
        sub_b. modify<host_exec_space>();
        sub_a1.modify<host_exec_space>();

        // attach_buffer(1, m, ptr): presumably (stride_0, stride_1, data),
        // i.e. column-major storage -- TODO confirm against DenseMatrixView.
        DenseMatrixViewHostType A, B, C;
        A.set_view(m, m);
        A.attach_buffer(1, m, sub_a.h_view.data());

        B.set_view(m, m);
        B.attach_buffer(1, m, sub_b.h_view.data());

        C.set_view(m, m);
        C.attach_buffer(1, m, sub_a1.h_view.data());

        randomize(A);
        randomize(B);
        randomize(C);

        // Copy the initial C (host side of a1) into the device side of a2; the
        // by-blocks run below updates a2 and leaves a1 untouched.
        sub_a2.modify<exec_space>();
        Kokkos::deep_copy(sub_a2.d_view, sub_a1.h_view);
      }

      // dense by blocks
      {
        // Make the inputs available in exec_space; a2 is about to be written.
        sub_a. sync  <exec_space>();
        sub_b. sync  <exec_space>();
        sub_a2.sync  <exec_space>();
        sub_a2.modify<exec_space>();

        DenseMatrixViewType A, B, C;
        A.set_view(m, m);
        A.attach_buffer(1, m, sub_a.d_view.data());

        B.set_view(m, m);
        B.attach_buffer(1, m, sub_b.d_view.data());

        C.set_view(m, m);
        C.attach_buffer(1, m, sub_a2.d_view.data());

        // Number of block rows/columns: ceil(m / mb).
        const ordinal_type bm = (m/mb) + (m%mb>0);

        // Build the block descriptors on the host ...
        ha.modify<host_exec_space>();
        hb.modify<host_exec_space>();
        hc.modify<host_exec_space>();

        DenseMatrixOfBlocksHostType HA, HB, HC;

        HA.set_view(bm, bm);
        HA.attach_buffer(1, bm, ha.h_view.data());

        HB.set_view(bm, bm);
        HB.attach_buffer(1, bm, hb.h_view.data());

        HC.set_view(bm, bm);
        HC.attach_buffer(1, bm, hc.h_view.data());

        // ... each block pointing into the corresponding exec_space matrix.
        setMatrixOfBlocks(HA, m, m, mb);
        attachBaseBuffer(HA, A.data(), A.stride_0(), A.stride_1());

        setMatrixOfBlocks(HB, m, m, mb);
        attachBaseBuffer(HB, B.data(), B.stride_0(), B.stride_1());

        setMatrixOfBlocks(HC, m, m, mb);
        attachBaseBuffer(HC, C.data(), C.stride_0(), C.stride_1());

        // ... then publish the descriptors to exec_space.
        ha.sync<exec_space>();
        hb.sync<exec_space>();
        hc.sync<exec_space>();

        DenseMatrixOfBlocksType DA, DB, DC;

        DA.set_view(bm, bm);
        DA.attach_buffer(1, bm, ha.d_view.data());

        DB.set_view(bm, bm);
        DB.attach_buffer(1, bm, hb.d_view.data());

        DC.set_view(bm, bm);
        DC.attach_buffer(1, bm, hc.d_view.data());

        {
          // Presumably computes C := alpha*A*B + beta*C -- TODO confirm against
          // TaskFunctor_Gemm.
          const double alpha = -1.0, beta = 1.0;
          for (ordinal_type iter=dry;iter<niter;++iter) {
            timer.reset();
            Kokkos::host_spawn(Kokkos::TaskSingle(sched, Kokkos::TaskPriority::High),
                               task_functor_gemm(sched, alpha, DA, DB, beta, DC));
            Kokkos::wait(sched);
            // (iter >= 0) zeroes out the contribution of warm-up iterations.
            t_byblocks += (iter >=0)*timer.seconds();
          }
          t_byblocks /= niter;
          //clearFutureOfBlocks(HC);
        }
      }

      // Bring both versions of C back to the host.
      // NOTE(review): nothing is compared or reported afterwards.
      {
        a1.sync<host_exec_space>();
        a2.sync<host_exec_space>();

      }

    }
  }
  Kokkos::finalize();

  return r_val;
}
Loading

0 comments on commit d1ecbe6

Please sign in to comment.