-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Contains the following applications: - MPI+Kokkos tasks GEMM - Locking MPI+Kokkos tasks graph500 - Non-locking MPI+Kokkos tasks graph500
- Loading branch information
0 parents
commit d1ecbe6
Showing
53 changed files
with
12,987 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
*.o | ||
*.a | ||
*.tmp | ||
KokkosCore_config.h | ||
Tacho_ExampleDenseByBlocks | ||
graph500_locking | ||
graph500_nonlocking |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Top-level build for the MPI+Kokkos examples:
#   - Tacho_ExampleDenseByBlocks  (built here)
#   - graph500_nonlocking / graph500_locking  (delegated to graph500/kokkos)

# MPI compiler wrapper and flags used for both compiling and linking.
CXX=mpic++
CXXFLAGS=-qopenmp -mkl

# Site-specific locations of the Trilinos build tree (TRILINOS_OPT) and source
# tree (TRILINOS_SRC); everything below is derived from these two paths.
TRILINOS_OPT=/fs/project/PZS0530/skhuvis/src/trilinos-build
TRILINOS_SRC=/fs/project/PZS0530/skhuvis/src/Trilinos
KOKKOS_PATH=${TRILINOS_SRC}/packages/kokkos
SHYLU_SRC=${TRILINOS_SRC}/packages/shylu/shylu_node/tacho/src
SHYLU_OPT=${TRILINOS_OPT}/packages/shylu/shylu_node/tacho/src
KOKKOS_OPT=${TRILINOS_OPT}/packages/kokkos
KOKKOS_DEVICES="OpenMP"
KOKKOS_ARCH="BDW"

# Supplies the KOKKOS_* variables referenced below
# (KOKKOS_CXXFLAGS, KOKKOS_LDFLAGS, KOKKOS_LIBS, KOKKOS_LINK_DEPENDS).
include ${KOKKOS_PATH}/Makefile.kokkos

INCS=-I${SHYLU_OPT} -I${SHYLU_SRC}

# Each library search directory is listed once; relative order is unchanged.
LDFLAGS=-lmetis $(KOKKOS_LDFLAGS) -L${KOKKOS_OPT}/core/src -L${KOKKOS_OPT}/containers/src -L${TRILINOS_OPT}/packages/common/auxiliarySoftware/SuiteSparse/src -L${SHYLU_OPT} -L${KOKKOS_OPT}/algorithms/src -L${TRILINOS_OPT}/commonTools/gtest -L./ -lkokkoscore -lkokkoscontainers -lshylu_nodetacho -ltrilinosss -lkokkosalgorithms -lgtest

EXE=Tacho_ExampleDenseByBlocks graph500_nonlocking graph500_locking

# These targets do not produce a file of the same name in this directory.
# Declaring them phony keeps a stray file from suppressing the sub-make.
.PHONY: all clean graph500_nonlocking graph500_locking

all: $(EXE)

Tacho_ExampleDenseByBlocks: Tacho_ExampleDenseByBlocks.o $(KOKKOS_LINK_DEPENDS)
	$(CXX) $(CXXFLAGS) -o Tacho_ExampleDenseByBlocks Tacho_ExampleDenseByBlocks.o $(LDFLAGS) $(KOKKOS_LIBS)

# The graph500 variants are built by the Makefile in graph500/kokkos.
# $(MAKE) (rather than a literal "make") lets -j, -n, -k and the jobserver
# propagate to the sub-make.
graph500_nonlocking:
	$(MAKE) -C graph500/kokkos graph500_nonlocking INCS="${INCS}" KOKKOS_PATH="${KOKKOS_PATH}" KOKKOS_DEVICES="$(KOKKOS_DEVICES)" KLDFLAGS="${LDFLAGS}"
graph500_locking:
	$(MAKE) -C graph500/kokkos graph500_locking INCS="${INCS}" KOKKOS_PATH="${KOKKOS_PATH}" KOKKOS_DEVICES="$(KOKKOS_DEVICES)" KLDFLAGS="${LDFLAGS}"

%.o : %.cpp
	$(CXX) -c $(CXXFLAGS) $(INCS) $(KOKKOS_CXXFLAGS) $< -o $@

# The leading '-' lets 'clean' succeed even if the sub-directory clean fails.
clean:
	rm -f *.o *.a *.tmp $(EXE)
	-$(MAKE) -C graph500/kokkos clean KOKKOS_PATH="${KOKKOS_PATH}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# README | ||
|
||
## Getting Trilinos | ||
|
||
``` | ||
git clone https://github.com/trilinos/Trilinos.git | ||
cd Trilinos | ||
git checkout b783a65 | ||
``` | ||
|
||
## Building Kokkos | ||
|
||
Run the following cmake command to build Kokkos | ||
``` | ||
cmake -D CMAKE_INSTALL_PREFIX=$INSTALL_DIR -D \ | ||
TPL_ENABLE_MPI:BOOL=OFF -D Trilinos_ENABLE_Fortran:BOOL=OFF -D TPL_ENABLE_Pthread:BOOL=OFF \ | ||
-D Kokkos_ENABLE_Pthread:BOOL=OFF -D Trilinos_ENABLE_OpenMP:BOOL=ON \ | ||
-D Kokkos_ENABLE_OpenMP:BOOL=ON -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF \ | ||
-D Trilinos_ENABLE_EXAMPLES:BOOL=ON -D Trilinos_ENABLE_TESTS:BOOL=ON \ | ||
-D Trilinos_ENABLE_KokkosCore:BOOL=ON -D Trilinos_ENABLE_KokkosContainers:BOOL=ON \ | ||
-D Trilinos_ENABLE_KokkosExample:BOOL=OFF -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON \ | ||
-D Trilinos_ENABLE_CXX11:BOOL=ON -D Kokkos_ENABLE_CXX11:BOOL=ON \ | ||
-D Kokkos_ENABLE_Serial:BOOL=ON -D Trilinos_ENABLE_ShyLU:BOOL=OFF \ | ||
-D Trilinos_ENABLE_ShyLU_DDCore:BOOL=OFF -D Trilinos_ENABLE_ShyLU_NodeTacho:BOOL=ON \ | ||
-D Trilinos_ENABLE_Teuchos:BOOL=ON -D Teuchos_ENABLE_TESTS:BOOL=OFF \ | ||
-D TPL_ENABLE_Cholmod:BOOL=OFF -D TPL_ENABLE_METIS:BOOL=ON \ | ||
-D METIS_INCLUDE_DIRS:FILEPATH=/apps/metis/intel/18.0/5.1.0/include \ | ||
-D METIS_LIBRARY_DIRS:FILEPATH=/apps/metis/intel/18.0/5.1.0/lib \ | ||
-D CMAKE_BUILD_TYPE:STRING=RELEASE -D CMAKE_VERBOSE_MAKEFILE:BOOL=OFF \ | ||
-D CMAKE_C_COMPILER=icc -D CMAKE_CXX_COMPILER=icpc \ | ||
-D CMAKE_CXX_FLAGS:STRING=-DKOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION \ | ||
-D CMAKE_EXE_LINKER_FLAGS:STRING="-lnuma -lrt -ldl -lgfortran" \ | ||
-D TPL_ENABLE_MKL:BOOL=ON -D TPL_MKL_LIBRARIES:FILEPATH=-mkl \ | ||
-D TPL_ENABLE_BLAS:BOOL=ON -D TPL_BLAS_LIBRARIES:FILEPATH=-mkl \ | ||
-D TPL_ENABLE_LAPACK:BOOL=ON -D TPL_LAPACK_LIBRARIES:FILEPATH=-mkl \ | ||
-D KOKKOS_ARCH=BDW $SRC_DIR | ||
``` | ||
|
||
`$SRC_DIR` should be replaced with the location where you cloned Trilinos (which contains Kokkos) and `$INSTALL_DIR` should be replaced with the installation directory. | ||
|
||
## Building MPI+Kokkos codes | ||
|
||
The included Makefile will build three applications: | ||
|
||
1. MPI+Kokkos GEMM code | ||
2. MPI+Kokkos locking Graph500 | ||
3. MPI+Kokkos non-locking Graph500 | ||
|
||
Make sure that you have already built Trilinos. The following changes will need to be made to the Makefile to build correctly: | ||
|
||
1. Replace `$TRILINOS_OPT` with `$INSTALL_DIR` | ||
2. Replace `$TRILINOS_SRC` with `$SRC_DIR` | ||
|
||
Then, run `make all` to generate the following executables: | ||
1. `Tacho_ExampleDenseByBlocks` | ||
2. `graph500/kokkos/graph500_locking` | ||
3. `graph500/kokkos/graph500_nonlocking` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,244 @@ | ||
#include "ShyLU_NodeTacho_config.h" | ||
|
||
#include <Kokkos_Core.hpp> | ||
#include <Kokkos_DualView.hpp> | ||
#include <impl/Kokkos_Timer.hpp> | ||
|
||
#include "Tacho_Util.hpp" | ||
#include "Tacho_DenseMatrixView.hpp" | ||
#include "Tacho_DenseFlopCount.hpp" | ||
|
||
#include "Tacho_Chol_ByBlocks.hpp" | ||
#include "Tacho_Gemm_ByBlocks.hpp" | ||
#include "Tacho_Herk_ByBlocks.hpp" | ||
#include "Tacho_Trsm_ByBlocks.hpp" | ||
|
||
#include "Tacho_CommandLineParser.hpp" | ||
|
||
#ifdef TACHO_HAVE_MKL | ||
#include "mkl_service.h" | ||
#endif | ||
|
||
using namespace Tacho; | ||
|
||
// Prints a "Time" heading followed by the ratio t_reference/t_byblocks.
// Expects variables named t_reference and t_byblocks to be in scope where the
// macro is expanded.
// NOTE(review): the label reads "byblocks/reference" while the printed value
// is reference/byblocks -- confirm which one is intended.
// NOTE(review): the macro is not expanded anywhere in the portion of the file
// shown here.
#define PRINT_TIMER \ | ||
printf(" Time \n"); \ | ||
printf(" byblocks/reference (speedup): %10.6f\n", t_reference/t_byblocks); \ | ||
printf("\n"); | ||
|
||
// Benchmark driver for the task-based ("by blocks") dense GEMM.
//
// For every matrix size m = mbeg, mbeg+step, ... <= mend it fills m x m
// operands with random values, partitions them into mb x mb blocks, spawns one
// task_functor_gemm on a Kokkos::TaskScheduler and accumulates the elapsed
// time in t_byblocks.
//
// Returns 0 right away when opts.parse() returns true (the help path, per the
// inline note below); otherwise returns r_val, which is never changed from 0
// in this function.
//
// NOTE(review): serial, nthreads, verbose and eps are set but never read in
// this function, and t_byblocks is never printed -- confirm whether reporting
// code was removed on purpose.
// NOTE(review): step <= 0 would make the size loop non-terminating and
// mb == 0 would divide by zero; neither option is validated here.
int main (int argc, char *argv[]) { | ||
CommandLineParser opts("This example program measure the performance of dense-by-blocks on Kokkos::OpenMP"); | ||
|
||
// Defaults for the command-line options registered below.
bool serial = false; | ||
int nthreads = 1; | ||
bool verbose = true; | ||
int mbeg = 1000; | ||
int mend = 6000; | ||
int step = 1000; | ||
int mb = 128; | ||
|
||
opts.set_option<bool>("serial", "Flag for invoking serial algorithm", &serial); | ||
opts.set_option<int>("kokkos-threads", "Number of threads", &nthreads); | ||
opts.set_option<bool>("verbose", "Flag for verbose printing", &verbose); | ||
opts.set_option<int>("begin", "Test problem begin size", &mbeg); | ||
opts.set_option<int>("end", "Test problem end size", &mend); | ||
opts.set_option<int>("step", "Test problem step size", &step); | ||
opts.set_option<int>("mb", "Blocksize", &mb); | ||
|
||
const bool r_parse = opts.parse(argc, argv); | ||
if (r_parse) return 0; // print help return | ||
|
||
Kokkos::initialize(argc, argv); | ||
|
||
typedef double value_type; | ||
typedef Kokkos::pair<ordinal_type,ordinal_type> range_type; | ||
typedef Kokkos::DefaultExecutionSpace exec_space; | ||
//typedef Kokkos::DefaultHostExecutionSpace exec_space; | ||
typedef Kokkos::DefaultHostExecutionSpace host_exec_space; | ||
|
||
printExecSpaceConfiguration<host_exec_space>("Default HostSpace"); | ||
printExecSpaceConfiguration< exec_space>("Default DeviceSpace"); | ||
|
||
int r_val = 0; | ||
const double eps = std::numeric_limits<double>::epsilon()*10000; | ||
// Inner scope: every Kokkos object declared inside it is destroyed before
// Kokkos::finalize() is called at the end of main.
{ | ||
// Scalar matrix views and "matrix of blocks" views (a matrix whose entries
// are themselves DenseMatrixViewType), in device and host flavours.
typedef DenseMatrixView<value_type,exec_space> DenseMatrixViewType; | ||
typedef DenseMatrixView<DenseMatrixViewType,exec_space> DenseMatrixOfBlocksType; | ||
|
||
typedef DenseMatrixView<value_type,host_exec_space> DenseMatrixViewHostType; | ||
typedef DenseMatrixView<DenseMatrixViewType,host_exec_space> DenseMatrixOfBlocksHostType; | ||
|
||
Kokkos::Impl::Timer timer; | ||
|
||
typedef Kokkos::TaskScheduler<exec_space> sched_type; | ||
sched_type sched; | ||
|
||
// Only task_functor_gemm is spawned below; the Chol/Trsm/Herk typedefs are
// not otherwise used in this function.
typedef TaskFunctor_Chol<sched_type,DenseMatrixOfBlocksType, | ||
Uplo::Upper,Algo::ByBlocks> task_functor_chol; | ||
typedef TaskFunctor_Trsm<sched_type,double,DenseMatrixOfBlocksType, | ||
Side::Left,Uplo::Upper,Trans::ConjTranspose,Diag::NonUnit,Algo::ByBlocks> task_functor_trsm; | ||
typedef TaskFunctor_Gemm<sched_type,double,DenseMatrixOfBlocksType, | ||
Trans::NoTranspose,Trans::NoTranspose,Algo::ByBlocks> task_functor_gemm; | ||
typedef TaskFunctor_Herk<sched_type,double,DenseMatrixOfBlocksType, | ||
Uplo::Upper,Trans::ConjTranspose,Algo::ByBlocks> task_functor_herk; | ||
|
||
// Sizing unit for the scheduler memory pool: four times the GEMM functor size.
const ordinal_type max_functor_size = 4*sizeof(task_functor_gemm); | ||
|
||
// Flat value buffers sized for the largest problem (mend x mend); smaller
// sizes use leading subviews of these allocations.
Kokkos::DualView<value_type*,exec_space> | ||
a("a", mend*mend), a1("a1", mend*mend), a2("a2", mend*mend), | ||
b("b", mend*mend); | ||
|
||
// bmend = mend/mb + 1 is an upper bound on the blocks per dimension for any
// m <= mend (the per-size count bm below is ceil(m/mb)).
const ordinal_type bmend = (mend/mb) + 1; | ||
Kokkos::DualView<DenseMatrixViewType*,exec_space> | ||
ha("ha", bmend*bmend), hb("hb", bmend*bmend), hc("hc", bmend*bmend); | ||
|
||
// Derive the scheduler's memory pool parameters from the block count and the
// functor size, print them, and construct the scheduler.
{ | ||
const ordinal_type | ||
task_queue_capacity_tmp = 2*bmend*bmend*bmend*max_functor_size, | ||
min_block_size = 16, | ||
max_block_size = 4*max_functor_size, | ||
num_superblock = 4, | ||
superblock_size = std::max(task_queue_capacity_tmp/num_superblock,max_block_size), | ||
task_queue_capacity = std::max(task_queue_capacity_tmp,superblock_size*num_superblock); | ||
|
||
std::cout << "capacity = " << task_queue_capacity << "\n"; | ||
std::cout << "min_block_size = " << min_block_size << "\n"; | ||
std::cout << "max_block_size = " << max_block_size << "\n"; | ||
std::cout << "superblock_size = " << superblock_size << "\n"; | ||
|
||
sched = sched_type(typename sched_type::memory_space(), | ||
(size_t)task_queue_capacity, | ||
(unsigned)min_block_size, | ||
(unsigned)max_block_size, | ||
(unsigned)superblock_size); | ||
} | ||
|
||
// The timed loop runs iter = dry .. niter-1; with dry = 0 and niter = 1 that
// is a single timed iteration.
const ordinal_type dry = 0, niter = 1; | ||
|
||
double t_reference = 0, t_byblocks = 0; | ||
|
||
// Fills every entry of a host matrix view with random.value(), iterating the
// second index in the outer loop and the first index in the inner loop.
Random<value_type> random; | ||
auto randomize = [&](const DenseMatrixViewHostType &mat) { | ||
const ordinal_type m = mat.extent(0), n = mat.extent(1); | ||
for (ordinal_type j=0;j<n;++j) | ||
for (ordinal_type i=0;i<m;++i) | ||
mat(i,j) = random.value(); | ||
}; | ||
|
||
/// | ||
/// Gemm | ||
/// | ||
for (ordinal_type m=mbeg;m<=mend;m+=step) { | ||
t_reference = 0; t_byblocks = 0; | ||
// Leading m*m entries of each flat buffer hold the current m x m problem.
auto sub_a = Kokkos::subview(a, range_type(0,m*m)); | ||
auto sub_b = Kokkos::subview(b, range_type(0,m*m)); | ||
auto sub_a1 = Kokkos::subview(a1, range_type(0,m*m)); | ||
auto sub_a2 = Kokkos::subview(a2, range_type(0,m*m)); | ||
|
||
// Host-side setup: randomize A (a), B (b) and C (a1), then copy C's initial
// values into the device side of a2.
{ | ||
sub_a. modify<host_exec_space>(); | ||
sub_b. modify<host_exec_space>(); | ||
sub_a1.modify<host_exec_space>(); | ||
|
||
DenseMatrixViewHostType A, B, C; | ||
A.set_view(m, m); | ||
// presumably attach_buffer(stride_0, stride_1, ptr), i.e. column-major with
// leading dimension m -- TODO confirm against Tacho_DenseMatrixView.hpp
A.attach_buffer(1, m, sub_a.h_view.data()); | ||
|
||
B.set_view(m, m); | ||
B.attach_buffer(1, m, sub_b.h_view.data()); | ||
|
||
C.set_view(m, m); | ||
C.attach_buffer(1, m, sub_a1.h_view.data()); | ||
|
||
randomize(A); | ||
randomize(B); | ||
randomize(C); | ||
|
||
sub_a2.modify<exec_space>(); | ||
Kokkos::deep_copy(sub_a2.d_view, sub_a1.h_view); | ||
} | ||
|
||
// dense by blocks | ||
{ | ||
// Bring a and b to exec_space; a2 is the output operand and is flagged as
// modified on exec_space.
sub_a. sync <exec_space>(); | ||
sub_b. sync <exec_space>(); | ||
sub_a2.sync <exec_space>(); | ||
sub_a2.modify<exec_space>(); | ||
|
||
DenseMatrixViewType A, B, C; | ||
A.set_view(m, m); | ||
A.attach_buffer(1, m, sub_a.d_view.data()); | ||
|
||
B.set_view(m, m); | ||
B.attach_buffer(1, m, sub_b.d_view.data()); | ||
|
||
C.set_view(m, m); | ||
C.attach_buffer(1, m, sub_a2.d_view.data()); | ||
|
||
// Blocks per dimension for this size: ceil(m / mb).
const ordinal_type bm = (m/mb) + (m%mb>0); | ||
|
||
ha.modify<host_exec_space>(); | ||
hb.modify<host_exec_space>(); | ||
hc.modify<host_exec_space>(); | ||
|
||
// Build the bm x bm block descriptors on the host ...
DenseMatrixOfBlocksHostType HA, HB, HC; | ||
|
||
HA.set_view(bm, bm); | ||
HA.attach_buffer(1, bm, ha.h_view.data()); | ||
|
||
HB.set_view(bm, bm); | ||
HB.attach_buffer(1, bm, hb.h_view.data()); | ||
|
||
HC.set_view(bm, bm); | ||
HC.attach_buffer(1, bm, hc.h_view.data()); | ||
|
||
// ... and point them at the exec_space data of A, B and C using those
// matrices' own strides.
setMatrixOfBlocks(HA, m, m, mb); | ||
attachBaseBuffer(HA, A.data(), A.stride_0(), A.stride_1()); | ||
|
||
setMatrixOfBlocks(HB, m, m, mb); | ||
attachBaseBuffer(HB, B.data(), B.stride_0(), B.stride_1()); | ||
|
||
setMatrixOfBlocks(HC, m, m, mb); | ||
attachBaseBuffer(HC, C.data(), C.stride_0(), C.stride_1()); | ||
|
||
// Make the block descriptors filled in on the host available on exec_space.
ha.sync<exec_space>(); | ||
hb.sync<exec_space>(); | ||
hc.sync<exec_space>(); | ||
|
||
DenseMatrixOfBlocksType DA, DB, DC; | ||
|
||
DA.set_view(bm, bm); | ||
DA.attach_buffer(1, bm, ha.d_view.data()); | ||
|
||
DB.set_view(bm, bm); | ||
DB.attach_buffer(1, bm, hb.d_view.data()); | ||
|
||
DC.set_view(bm, bm); | ||
DC.attach_buffer(1, bm, hc.d_view.data()); | ||
|
||
{ | ||
// presumably computes C = beta*C + alpha*A*B -- TODO confirm against
// Tacho_Gemm_ByBlocks.hpp
const double alpha = -1.0, beta = 1.0; | ||
for (ordinal_type iter=dry;iter<niter;++iter) { | ||
timer.reset(); | ||
Kokkos::host_spawn(Kokkos::TaskSingle(sched, Kokkos::TaskPriority::High), | ||
task_functor_gemm(sched, alpha, DA, DB, beta, DC)); | ||
Kokkos::wait(sched); | ||
// (iter >= 0) is 0 for negative iter, so iterations run with dry < 0 would
// not contribute to the accumulated time.
t_byblocks += (iter >=0)*timer.seconds(); | ||
} | ||
t_byblocks /= niter; | ||
//clearFutureOfBlocks(HC); | ||
} | ||
} | ||
|
||
// NOTE(review): a1 and a2 are synced to the host but nothing reads them
// afterwards in this block -- presumably left over from a comparison against
// a reference result; confirm.
{ | ||
a1.sync<host_exec_space>(); | ||
a2.sync<host_exec_space>(); | ||
|
||
} | ||
|
||
} | ||
} | ||
Kokkos::finalize(); | ||
|
||
return r_val; | ||
} |
Oops, something went wrong.