Skip to content

Building the prerequisites on SCARF with Intel compilers

Vedran Novakovic edited this page Jan 20, 2017 · 34 revisions

Modules

Modules used on -cpu queue nodes:

  • gcc/6.2.0
  • intel/17.0
  • cmake/3.4.3

Modules used on -gpu queue nodes:

  • binutils/2.25
  • gcc/4.9.2
  • cuda/7.5.18
  • intel/17.0

The GCC version has to be at most 4.9.x for nvcc to work.

Install the latest versions of texinfo, autoconf, automake, help2man, and libtool from source (build with Intel compilers).

METIS

Metis5 will not work with HSL!

Use Metis4 instead.

Change CC in Makefile.in to icc. Then run make, followed by ln -s . lib, so that the directory layout is forward compatible with a Metis5 install. Also, symlink the build directory to sw/metis (as seen in the link line of SpLLT below).

When/if Metis5 is supported:

make config prefix=$METISDIR cc=icc openmp=1
make && make install

MT-METIS

Not needed, but if desired, it can be built with:

./configure --prefix=$MTMETISDIR --cc=icc
make && make install

MAGMA

MAGMA GPU 2.1.0.

To avoid this and similar errors:

icpc  -qopenmp -Wl,-rpath,../lib \
	-o testing/testing_zhemv testing/testing_zhemv.o \
	-L./lib -lmagma \
	-L./testing -ltest \
	-L./testing/lin -llapacktest \
	-L/apps/cuda/7.5.18/lib64 -L/apps/intel/2017/compilers_and_libraries_2017/linux/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -lstdc++ -lm -lcublas -lcusparse -lcudart
./testing/libtest.a(magma_zutil.o): In function `safe_lapackf77_zlanhe':
magma_zutil.cpp:(.text+0xc91): undefined reference to `magma_get_lapack_numthreads'
magma_zutil.cpp:(.text+0xc9f): undefined reference to `magma_set_lapack_numthreads'
magma_zutil.cpp:(.text+0xcc4): undefined reference to `magma_set_lapack_numthreads'
make: *** [testing/testing_zhemv] Error 1

change the Makefile as below:

--- magma-2.1.0/Makefile	2016-08-30 14:39:50.000000000 +0100
+++ Makefile	2016-11-15 09:50:28.182634000 +0000
@@ -40,7 +40,7 @@
 # Extension for object files: o for unix, obj for Windows?
 o_ext      ?= o
 
-prefix     ?= /usr/local/magma
+prefix     ?= $(MAGMADIR)
 
 
 
@@ -74,7 +74,7 @@
     GPU_TARGET += sm20
 endif
 ifneq ($(findstring Kepler, $(GPU_TARGET)),)
-    GPU_TARGET += sm30 sm35
+    GPU_TARGET += sm35 # sm30
 endif
 ifneq ($(findstring Maxwell, $(GPU_TARGET)),)
     GPU_TARGET += sm50
@@ -609,9 +609,9 @@
 $(testers): %: %.$(o_ext)
 	$(CXX) $(LDFLAGS) $(RPATH) \
 	-o $@ $< \
-	-L./lib -lmagma \
 	-L./testing -ltest \
 	-L./testing/lin -llapacktest \
+	-L./lib -lmagma \
 	$(LIBS)
 
 # link Fortran testing_foo from testing_foo.o

Avoid compiling for Fermi architecture and modernize the flags:

--- magma-2.1.0/make.inc.mkl-icc	2016-08-30 14:37:48.000000000 +0100
+++ make.inc.mkl-icc	2016-11-08 16:40:40.244849000 +0000
@@ -16,7 +16,7 @@
 # Note that NVIDIA no longer supports 1.x cards, as of CUDA 6.5.
 # See http://developer.nvidia.com/cuda-gpus
 #
-#GPU_TARGET ?= Fermi Kepler
+GPU_TARGET ?= Kepler
 
 # --------------------
 # programs
@@ -36,13 +36,13 @@
 
 # Use -fPIC to make shared (.so) and static (.a) library;
 # can be commented out if making only static library.
-FPIC      = -fPIC
+#FPIC      = -fPIC
 
-CFLAGS    = -O3 $(FPIC) -openmp -DADD_ -Wall -Wshadow -DMAGMA_WITH_MKL
+CFLAGS    = -O3 $(FPIC) -qopenmp -DADD_ -Wall -Wshadow -DMAGMA_WITH_MKL
 FFLAGS    = -O3 $(FPIC)         -DADD_ -warn all -warn nounused -nogen-interfaces
 F90FLAGS  = -O3 $(FPIC)         -DADD_ -warn all -warn nounused
 NVCCFLAGS = -O3                 -DADD_ -Xcompiler "$(FPIC) -Wall -Wno-unused-function"
-LDFLAGS   =     $(FPIC) -openmp
+LDFLAGS   =     $(FPIC) -qopenmp
 
 # C++11 (icc >= 13) is not required, but has benefits like atomic operations
 CXXFLAGS := $(CFLAGS) -std=c++11

Running the test suite produces some strange errors (failures for ngpu >= 2 are omitted from the list below, since the test machine has only 1 GPU):

****************************************************************************************************
summary
****************************************************************************************************
130348 tests in 1427 commands passed
 1031 tests failed accuracy test
    8 errors detected (crashes, CUDA errors, etc.)
routines with failures:
    testing_cgeev -RV -LV -c
    testing_cgeqlf -c
    testing_cgeqr2x_gpu --version 2 -c
    testing_cgeqr2x_gpu --version 4 -c
    testing_cgeqrf -c2
    testing_cgeqrf_gpu --version 2 -c2
    testing_cgesdd --jobu a -c
    testing_cgesdd --jobu n -c
    testing_cgesdd --jobu o -c
    testing_cgesdd --jobu s -c
    testing_cgesv_rbt -c
    testing_cheevd --version 3 --fraction 1.0 -L -JN -c
    testing_cheevd --version 3 --fraction 1.0 -L -JV -c
    testing_cheevd --version 3 --fraction 1.0 -U -JN -c
    testing_cheevd --version 3 --fraction 1.0 -U -JV -c
    testing_cheevd_gpu --version 3 --fraction 1.0 -L -JN -c
    testing_cheevd_gpu --version 3 --fraction 1.0 -L -JV -c
    testing_cheevd_gpu --version 3 --fraction 1.0 -U -JN -c
    testing_cheevd_gpu --version 3 --fraction 1.0 -U -JV -c
    testing_chegvd -L -JV --itype 3 -c
    testing_chegvd -U -JV --itype 3 -c
    testing_chegvdx --version 1 -L -JV --itype 3 -c
    testing_chegvdx --version 1 -U -JV --itype 3 -c
    testing_chegvdx --version 2 -L -JV --itype 3 -c
    testing_chegvdx --version 2 -U -JV --itype 3 -c
    testing_chegvdx --version 3 -L -JV --itype 3 -c
    testing_chegvdx --version 3 -U -JV --itype 2 -c
    testing_chegvdx --version 3 -U -JV --itype 3 -c
    testing_chegvdx_2stage -L -JV --itype 2 -c
    testing_chegvdx_2stage -L -JV --itype 3 -c
    testing_clange -c
    testing_clanhe -c
    testing_clarfb_gpu --version 1 -c
    testing_clarfb_gpu --version 2 -c
    testing_cnan_inf -c
    testing_ctrmv -L -C -DN -c
    testing_ctrmv -L -C -DU -c
    testing_ctrmv -L -DN -c
    testing_ctrmv -L -DU -c
    testing_ctrmv -U -C -DN -c
    testing_ctrmv -U -C -DU -c
    testing_ctrmv -U -DN -c
    testing_ctrmv -U -DU -c
    testing_dgeqlf -c
    testing_dgeqr2_gpu -c
    testing_dgeqr2x_gpu --version 2 -c
    testing_dgeqr2x_gpu --version 4 -c
    testing_dgeqrf -c2
    testing_dgeqrf_gpu --version 2 -c2
    testing_dlange -c
    testing_dlansy -c
    testing_dlarfb_gpu --version 1 -c
    testing_dlarfb_gpu --version 2 -c
    testing_dsygvd -U -JV --itype 2 -c
    testing_dsygvd -U -JV --itype 3 -c
    testing_dsygvdx --version 1 -U -JV --itype 2 -c
    testing_dsygvdx --version 1 -U -JV --itype 3 -c
    testing_dtrmv -L -C -DN -c
    testing_dtrmv -L -C -DU -c
    testing_dtrmv -U -C -DN -c
    testing_dtrmv -U -C -DU -c
    testing_dtrsm -SL -L -C -DU -c
    testing_dtrsm -SL -L -DN -c
    testing_dtrsm -SL -U -DU -c
    testing_dtrsm -SR -L -DU -c
    testing_dtrsm -SR -U -C -DU -c
    testing_sgegqr_gpu --version 4 -c
    testing_sgeqlf -c
    testing_sgeqr2x_gpu --version 1 -c
    testing_sgeqr2x_gpu --version 2 -c
    testing_sgeqr2x_gpu --version 3 -c
    testing_sgeqr2x_gpu --version 4 -c
    testing_sgesv_rbt -c
    testing_slange -c
    testing_slansy -c
    testing_slarfb_gpu --version 1 -c
    testing_slarfb_gpu --version 2 -c
    testing_snan_inf -c
    testing_ssygvd -L -JV --itype 2 -c
    testing_ssygvd -L -JV --itype 3 -c
    testing_ssygvd -U -JV --itype 2 -c
    testing_ssygvd -U -JV --itype 3 -c
    testing_ssygvdx --version 1 -L -JV --itype 2 -c
    testing_ssygvdx --version 1 -L -JV --itype 3 -c
    testing_ssygvdx --version 1 -U -JV --itype 2 -c
    testing_ssygvdx --version 1 -U -JV --itype 3 -c
    testing_ssygvdx_2stage -L -JV --itype 2 -c
    testing_ssygvdx_2stage -L -JV --itype 3 -c
    testing_strmv -L -C -DN -c
    testing_strmv -L -C -DU -c
    testing_strmv -L -DU -c
    testing_strmv -U -C -DN -c
    testing_strmv -U -C -DU -c
    testing_strsm -SL -L -C -DU -c
    testing_strsm -SL -L -DN -c
    testing_strsm -SL -U -C -DN -c
    testing_strsm -SL -U -DU -c
    testing_strsm -SR -L -C -DN -c
    testing_strsm -SR -L -DU -c
    testing_strsm -SR -U -C -DU -c
    testing_strsm -SR -U -DN -c
    testing_zgeev -RV -LV -c
    testing_zgemm -l -CC -c
    testing_zgemm -l -CN -c
    testing_zgemm -l -NC -c
    testing_zgemm -l -NN -c
    testing_zgeqlf -c
    testing_zgeqr2x_gpu --version 1 -c
    testing_zgeqr2x_gpu --version 2 -c
    testing_zgeqr2x_gpu --version 3 -c
    testing_zgeqr2x_gpu --version 4 -c
    testing_zgesdd --jobu a -c
    testing_zgesdd --jobu n -c
    testing_zgesdd --jobu o -c
    testing_zgesdd --jobu s -c
    testing_zgesv_rbt -c
    testing_zheevd --version 3 --fraction 1.0 -L -JN -c
    testing_zheevd --version 3 --fraction 1.0 -L -JV -c
    testing_zheevd --version 3 --fraction 1.0 -U -JN -c
    testing_zheevd --version 3 --fraction 1.0 -U -JV -c
    testing_zheevd_gpu --version 3 --fraction 1.0 -L -JN -c
    testing_zheevd_gpu --version 3 --fraction 1.0 -L -JV -c
    testing_zheevd_gpu --version 3 --fraction 1.0 -U -JN -c
    testing_zheevd_gpu --version 3 --fraction 1.0 -U -JV -c
    testing_zhegvd -L -JV --itype 2 -c
    testing_zhegvd -L -JV --itype 3 -c
    testing_zhegvdx --version 1 -L -JV --itype 2 -c
    testing_zhegvdx --version 1 -L -JV --itype 3 -c
    testing_zhegvdx --version 1 -U -JV --itype 3 -c
    testing_zhegvdx --version 2 -L -JV --itype 3 -c
    testing_zhegvdx --version 2 -U -JV --itype 2 -c
    testing_zhegvdx --version 2 -U -JV --itype 3 -c
    testing_zhegvdx --version 3 -L -JV --itype 3 -c
    testing_zhegvdx --version 3 -U -JV --itype 2 -c
    testing_zhegvdx --version 3 -U -JV --itype 3 -c
    testing_zhegvdx_2stage -L -JV --itype 2 -c
    testing_zhegvdx_2stage -L -JV --itype 3 -c
    testing_zlange -c
    testing_zlarfb_gpu --version 1 -c
    testing_zlarfb_gpu --version 2 -c
    testing_ztrmv -L -C -DN -c
    testing_ztrmv -L -C -DU -c
    testing_ztrmv -L -DN -c
    testing_ztrmv -L -DU -c
    testing_ztrmv -U -C -DN -c
    testing_ztrmv -U -C -DU -c
    testing_ztrmv -U -DN -c
    testing_ztrmv -U -DU -c

HWLOC

Don't build from the Git sources!

Use a tarball instead, and:

CC=icc CXX=icpc ./configure --prefix=$HWLOCDIR --enable-dependency-tracking --disable-shared --enable-static --disable-cairo --disable-libxml2 --with-gnu-ld
make && make install

When running make check, the CUDA test segfaulted.

GTG

autoreconf -isv
CC=icc F77=ifort ./configure --prefix=$GTGDIR --enable-dependency-tracking --disable-shared --with-gnu-ld --with-fortran
make && make install

SPRAL

GCC: Hack configure.ac for HWLOC support, then:

CC=gcc F77=gfortran FC=gfortran CXX=g++ CPPFLAGS="-I$HWLOCDIR/include" LDFLAGS="-L$HWLOCDIR/lib" LIBS="-lhwloc -lnvidia-ml" NVCCFLAGS="-arch=sm_35" ./configure --prefix=$SPRALDIR --enable-dependency-tracking --with-blas="-L$MKLROOT -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core" --with-lapack="-L$MKLROOT -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core" --with-metis="-L$METISDIR/lib -lmetis"

For Intel Fortran and OpenMP support, change Makefile.am as below:

--- spral/Makefile.am	2016-10-20 16:30:21.000000000 +0100
+++ Makefile.am	2016-11-09 12:58:09.994758000 +0000
@@ -6,11 +6,11 @@
 # NVCC setup
 PTX_FLAGS = -v
 #NVCCFLAGS = -Iinclude -arch=sm_20 -g -Xptxas="${PTX_FLAGS}"
-OPENMP_LIB = -lgomp # FIXME: autoconf this
+OPENMP_LIB = -liomp5 # FIXME: autoconf this
 AM_NVCC_FLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/src
 AM_LD_FLAGS = -lcuda
 NVCCLINK = \
-	$(NVCC) $(NVCCFLAGS) $(AM_NVCC_FLAGS) $(AM_LD_FLAGS) $(LDFLAGS) $(OPENMP_LIB) -o $@
+	$(NVCC) $(NVCCFLAGS) $(AM_NVCC_FLAGS) $(AM_LD_FLAGS) $(LDFLAGS) $(OPENMP_LIB) /apps/intel/2017/compilers_and_libraries/linux/lib/intel64_lin/for_main.o -o $@
 .cu.o:
 	$(NVCC) $(NVCCFLAGS) $(AM_NVCC_FLAGS) -dc -o $@ $<
 

Note that the Fortran main object (for_main.o) is added to the nvcc link line.

TODO: figure out how to enable GTG.

./autogen.sh
HWLOC_CFLAGS="-I$HWLOCDIR/include" HWLOC_LIBS="-L$HWLOCDIR/lib -lhwloc" NVCCFLAGS="-g -arch=sm_35" CC=icc CXX=icpc F77=ifort FC=ifort ./configure --prefix=$SPRALDIR --enable-dependency-tracking --with-blas="-L$MKLROOT -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread" --with-lapack="-L$MKLROOT -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread" --with-metis="-L$METISDIR/lib -lmetis"
make

For now, make check fails as below:

nvcc -g -arch=sm_35 -I./include -I./src -lcuda  -liomp5  /apps/intel/2017/compilers_and_libraries/linux/lib/intel64_lin/for_main.o -o ssids_test tests/ssids/ssids.o -L. -lspral -L/home/cseg/scarf366/NLAFET/sw/metis/lib -lmetis -L/apps/intel/2017/compilers_and_libraries_2017/linux/mkl -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -L/apps/intel/2017/compilers_and_libraries_2017/linux/mkl -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread  -L/home/cseg/scarf366/NLAFET/sw/hwloc/lib -lhwloc -L/apps/intel/2017/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64 -L/apps/intel/2017/compilers_and_libraries_2017.0.098/linux/ipp/../compiler/lib/intel64 -L/apps/intel/2017/ipp/lib/intel64 -L/apps/intel/2017/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64_lin -L/apps/intel/2017/mkl/lib/intel64 -L/apps/intel/2017/tbb/lib/intel64_lin/gcc4.7 -L/apps/gcc/4.9.2/lib/gcc/x86_64-unknown-linux-gnu/4.9.2/ -L/apps/gcc/4.9.2/lib/gcc/x86_64-unknown-linux-gnu/4.9.2/../../../../lib64 -L/apps/gcc/4.9.2/lib/gcc/x86_64-unknown-linux-gnu/4.9.2/../../../../lib64/ -L/lib/../lib64 -L/lib/../lib64/ -L/usr/lib/../lib64 -L/usr/lib/../lib64/ -L/apps/intel/2017/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64/ -L/apps/intel/2017/compilers_and_libraries_2017.0.098/linux/ipp/../compiler/lib/intel64/ -L/apps/intel/2017/ipp/lib/intel64/ -L/apps/intel/2017/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64_lin/ -L/apps/intel/2017/mkl/lib/intel64/ -L/apps/intel/2017/tbb/lib/intel64_lin/gcc4.7/ -L/apps/gcc/4.9.2/lib/gcc/x86_64-unknown-linux-gnu/4.9.2/../../../ -L/lib64 -L/lib/ -L/usr/lib64 -L/usr/lib -lifport -lifcoremt -limf -lsvml -lm -lipgo -lirc -lpthread -lirc_s -ldl -lrt -lcublas 
tests/ssids/ssids.o: In function `chk_answer':
/home/cseg/scarf366/NLAFET/ralna/spral-git/tests/ssids/ssids.f90:1660: undefined reference to `ssids_factor_solve_'
collect2: error: ld returned 1 exit status
make[1]: *** [ssids_test] Error 1

Just remove ssids_test$(EXEEXT) from check_PROGRAMS and TESTS in the Makefile, and re-run make check. It will then fail with:

nvcc fatal   : Unknown option 'nofor_main'
make[1]: *** [examples/C/ssids] Error 1

Then, comment out NO_FORT_MAIN and remove /apps/intel/2017/compilers_and_libraries/linux/lib/intel64_lin/for_main.o from the Makefile, and re-run make check. All tests should pass. Finally, run make install.

FxT

./bootstrap
CC=icc ./configure --prefix=$FXTDIR --enable-dependency-tracking --disable-shared
make && make check && make install

StarPU

Patch configure.ac as follows:

--- starpu/configure.ac	2016-11-04 14:24:19.000000000 +0000
+++ configure.ac	2016-11-14 18:09:47.209626000 +0000
@@ -1772,7 +1772,7 @@
        if test x$enable_simgrid = xyes ; then
            DEFAULT_MPICC=smpicc
        else
-           DEFAULT_MPICC=mpicc
+           DEFAULT_MPICC=mpiicc
        fi
        # nothing was specified: default value is used
        AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
@@ -1822,7 +1822,7 @@
        if test x$enable_simgrid = xyes ; then
            DEFAULT_MPICXX=smpicxx
        else
-           DEFAULT_MPICXX=mpicxx
+           DEFAULT_MPICXX=mpiicpc
        fi
        # nothing was specified: default value is used
        AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
@@ -1855,7 +1855,7 @@
         # On Darwin (and maybe other systems ?) the linker will fail (undefined
         # references to MPI_*). We manually add the required flags to fix this
         # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+        #AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
 else
     cc_or_mpicc=$CC
 fi
@@ -2362,7 +2362,7 @@
 					      if test x$enable_simgrid = xyes ; then
 						      DEFAULT_MPIFORT=smpifort
 					      else
-						      DEFAULT_MPIFORT=mpif90
+						      DEFAULT_MPIFORT=mpiifort
 					      fi
 					      # nothing was specified: default value is used
 					      AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$(dirname $mpicc_path):$simgrid_dir/bin:$PATH])

Also, in min-dgels, fix Makefile.in and base/make.inc to use icc instead of gcc.

Then, configure StarPU:

./autogen.sh
CC=icc CXX=icpc F77=ifort FC=ifort MAGMA_CFLAGS="-DADD_ -I$MAGMADIR/include" MAGMA_LIBS="-L$MAGMADIR/lib -lmagma_sparse -lmagma -lcusparse" FXT_CFLAGS="-I$FXTDIR/include" FXT_LIBS="-lfxt" FXT_LDFLAGS="-L$FXTDIR/lib" HWLOC_CFLAGS="-I$HWLOCDIR/include" HWLOC_LIBS="-L$HWLOCDIR/lib -lhwloc" ./configure --prefix=$STARPUDIR --enable-dependency-tracking --disable-shared --enable-long-check --enable-new-check --disable-fstack-protector-all --enable-fast --enable-paje-codelet-details --enable-fxt-lock --enable-memory-stats --disable-opencl --enable-mpi-master-slave --enable-mpi-check --enable-mpi-progression-hook --enable-openmp --enable-blas-lib=mkl --with-gnu-ld --with-cuda-dir=$CUDADIR --with-fxt=$FXTDIR --with-mkl-cflags="-I$MKLROOT/include" --with-mkl-ldflags="-L$MKLROOT -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread"

The final output should be:

configure:

	CPUs   enabled: yes
	CUDA   enabled: yes
	OpenCL enabled: no
	SCC    enabled: no
	MIC    enabled: no

	Compile-time limits
	(change these with --enable-maxcpus, --enable-maxcudadev,
	--enable-maxopencldev, --enable-maxmicdev, --enable-maxnodes,
        --enable-maxbuffers)
        (Note these numbers do not represent the number of detected
	devices, but the maximum number of devices StarPU can manage)

	Maximum number of CPUs:                     64
	Maximum number of CUDA devices:             4
	Maximum number of OpenCL devices:           0
	Maximum number of SCC devices:              0
	Maximum number of MIC threads:              0
	Maximum number of MPI master-slave devices: 4
	Maximum number of memory nodes:             16
	Maximum number of task buffers:             8

	GPU-GPU transfers: yes
	Allocation cache:  yes

	Magma enabled:     yes
	BLAS library:      mkl
	hwloc:             yes
	FxT trace enabled: yes
	StarPU-Top:        no

        Documentation:     no
        Examples:          yes

	StarPU Extensions:
	       StarPU MPI enabled:                          yes
	       MPI test suite:                              yes
	       Master-Slave MPI enabled:                    yes
	       FFT Support:                                 yes
	       GCC plug-in:                                 no
	       GCC plug-in test suite (requires GNU Guile): no
	       OpenMP runtime support enabled:              yes
	       SOCL enabled:                                no
               SOCL test suite:                             no
               Scheduler Hypervisor:                        no
               simgrid enabled:                             no
               ayudame enabled:                             no
	       Native fortran support:                      yes
	       Native MPI fortran support:                  yes

Then, run make -j and make install (optionally, make check as well).

If no CUDA, MAGMA, FxT, OpenMP, etc., are required, the configure line is:

CC=icc CXX=icpc F77=ifort FC=ifort HWLOC_CFLAGS="-I$HWLOCDIR/include" HWLOC_LIBS="-L$HWLOCDIR/lib -lhwloc" ./configure --prefix=$STARPUDIR-cpu --enable-dependency-tracking --disable-shared --disable-fstack-protector-all --enable-fast --disable-opencl --disable-openmp --enable-blas-lib=mkl --disable-cuda --without-fxt --with-mkl-cflags="-I$MKLROOT/include" --with-mkl-ldflags="-L$MKLROOT -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread" --disable-mpi --disable-export-dynamic --without-pic

PLASMA

Fix make.inc:

--- plasma/make.inc.mkl-gcc	2016-11-15 15:38:11.485809000 +0000
+++ make.inc.mkl-gcc	2016-11-15 15:46:37.310727000 +0000
@@ -7,7 +7,7 @@
 # --------------------
 # programs
 
-CC        = gcc
+CC        = icc
 
 ARCH      = ar
 ARCHFLAGS = cr
@@ -19,10 +19,10 @@
 
 # Use -fPIC to make shared (.so) and static (.a) libraries;
 # can be commented out if making only static libraries.
-FPIC      = -fPIC
+#FPIC      = -fPIC
 
-CFLAGS    = -fopenmp $(FPIC) -O3 -std=c99 -Wall -pedantic -Wshadow -Wno-unused-function
-LDFLAGS   = -fopenmp $(FPIC)
+CFLAGS    = -qopenmp $(FPIC) -O3 -std=c99 -Wall -Wshadow -Wno-unused-function
+LDFLAGS   = -qopenmp $(FPIC)
 
 # options for MKL
 CFLAGS   += -DPLASMA_WITH_MKL \
@@ -39,6 +39,6 @@
 
 # With gcc OpenMP (libgomp), use -lmkl_sequential or (-lmkl_gnu_thread   with MKL_NUM_THREADS=1).
 # With icc OpenMP (liomp5),  use -lmkl_sequential or (-lmkl_intel_thread with MKL_NUM_THREADS=1).
-LIBS      = -L$(MKLROOT)/lib -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lm
+LIBS      = -L$(MKLROOT)/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lm
 
 INC       = -I$(MKLROOT)/include

Fix Makefile:

--- plasma/Makefile	2016-11-15 15:38:09.749642000 +0000
+++ Makefile	2016-11-15 15:43:00.177155000 +0000
@@ -36,7 +36,7 @@
 INC       ?= -I$(LAPACKDIR)/LAPACKE/include -I$(CBLASDIR)/include
 LIBS      ?= -L$(LAPACKDIR) -llapack -llapacke -L$(CBLASDIR)/lib -lcblas -lblas
 
-prefix    ?= /usr/local/plasma
+prefix    ?= $(PLASMADIR)
 
 
 # ----------------------------------------

PARSEC

Built without GTG, profiling/statistics, and CUDA. With CUDA enabled, the build fails with the following error:

[ 37%] Building C object data_dist/matrix/CMakeFiles/dague_distribution_matrix.dir/reduce_col.c.o
/home/cseg/scarf366/NLAFET/deps/build/data_dist/matrix/reduce_col.c(910): error: identifier "gRtop" is undefined
      if ( NULL != gRtop ) {
                   ^

compilation aborted for /home/cseg/scarf366/NLAFET/deps/build/data_dist/matrix/reduce_col.c (code 2)
make[2]: *** [data_dist/matrix/CMakeFiles/dague_distribution_matrix.dir/reduce_col.c.o] Error 2
make[1]: *** [data_dist/matrix/CMakeFiles/dague_distribution_matrix.dir/all] Error 2
make: *** [all] Error 2

SpLLT

SpLLT was built with the StarPU runtime, Intel-threaded MKL, and MAGMA.

Original CMakeFiles/spllt_test.dir/link.txt hacked into:

/apps/intel/2017/compilers_and_libraries/linux/bin/intel64/ifort -qopenmp -O2 -g CMakeFiles/spllt_test.dir/spllt_test.F90.o -o spllt_test -L/apps/intel/2017/compilers_and_libraries_2017/linux/mkl/lib/intel64 src/libspllt.a /home/cseg/scarf366/NLAFET/sw/starpu/lib/libstarpu-1.3.a /home/cseg/scarf366/NLAFET/sw/FxT/lib/libfxt.a /home/cseg/scarf366/NLAFET/sw/hwloc/lib/libhwloc.a /home/cseg/scarf366/NLAFET/deps/magma-2.1.0/lib/libmagma.a src/libspllt_cuda.a /home/cseg/scarf366/NLAFET/sw/metis/lib/libmetis.a /home/cseg/scarf366/NLAFET/ralna/spral-git/libspral.a -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lirng -ldecimal -lcilkrts -lstdc++ /apps/cuda/7.5.18/lib64/libcusparse_static.a /apps/cuda/7.5.18/lib64/libcublas_static.a /apps/cuda/7.5.18/lib64/libculibos.a /apps/cuda/7.5.18/lib64/libcudart_static.a -Wl,-rpath,/apps/intel/2017/compilers_and_libraries_2017/linux/mkl/lib/intel64 /usr/lib64/libcuda.so /usr/lib64/libpthread.so /usr/lib64/libdl.so /usr/lib64/librt.so

StarPU needs FxT for tracing, and CUDA and runtime dependencies had to be satisfied manually.