fmihpc · markusbattarbee · Apr 23, 2024 · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024
diff --git a/.github/workflows/github-ci.yml b/.github/workflows/github-ci.yml
@@ -182,7 +182,6 @@ jobs:
         path: vlasiator
         if-no-files-found: error
 
-
   build_testpackage:
     # Build Vlasiator with testpackage flags, on the carrington cluster
     # (for subsequent running of the integration test package)
@@ -228,6 +227,46 @@ jobs:
     #    name: Testpackage build log
     #    path: build.log
 
+  build_testpackage_ukkoGPU:
+    # Build Vlasiator with testpackage flags, on the ukko cluster using
+    # its NVIDIA GPUs
+    runs-on: carrington
+
+    steps:
+      - name: Clean workspace
+        run: |
+          RUN_STRING=$( cat << MORO
+          rm -rf libraries library-build testpackage
+          rm -f libraries.tar.zst testpackage_check_description.txt testpackage-output.tar.gz metrics.txt stdout.txt stderr.txt testpackage_output_variables.txt
+          rm -f *.xml
+          MORO
+          )
+          srun -M ukko bash -c "$RUN_STRING"
+      - name: Checkout source
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Make clean
+        run: VLASIATOR_ARCH=ukko_cuda make clean
+      - uses: ursg/gcc-problem-matcher@master
+      - name: Compile vlasiator (Testpackage build w/ CUDA)
+        run: |
+          export VLASIATOR_ARCH=ukko_cuda
+          srun -Mukko -pgpu-oversub --cpus-per-gpu=8 --mem-per-gpu=20G --job-name CI_tp_compile --interactive --nodes=1 -n 1 -c 16 -t 1:00:0 bash -c 'module purge; ml GCC/11.2.0; ml OpenMPI/4.1.1-GCC-11.2.0; ml PMIx/4.1.0-GCCcore-11.2.0; ml PAPI/6.0.0.1-GCCcore-11.2.0; ml CUDA; ml Boost/1.55.0-GCC-11.2.0; export VLASIATOR_ARCH=ukko_cuda; make -j 9 testpackage; sleep 10s'
+      - name: Make sure the output binary is visible in lustre
+        uses: nick-fields/retry@v3
+        with:
+          timeout_seconds: 15
+          max_attempts: 3
+          retry_on: error
+          command: ls vlasiator
+      - name: Upload testpackage binary
+        uses: actions/upload-artifact@v4
+        with:
+          name: vlasiator-testpackage-gpu
+          path: vlasiator
+          if-no-files-found: error
+
   build_riscv:
     runs-on: risc-v
     needs: build_libraries_riscv
@@ -394,6 +433,68 @@ jobs:
       # Note: Testpackage output is further processed in the pr_report.yml workflow
       # (to produce Checks against pull requests)
 
+  run_testpackage_gpu:
+    # Run the GPU testpackage via Slurm (ukko), launched from the carrington runner
+    runs-on: carrington
+    needs: [build_testpackage_ukkoGPU, build_tools]
+    continue-on-error: true
+
+    steps:
+    - name: Checkout source
+      uses: actions/checkout@v4
+      with:
+        submodules: false
+    - name: Download testpackage binary
+      uses: actions/download-artifact@v4
+      with:
+        name: vlasiator-testpackage-gpu
+    - name: Download tools
+      uses: actions/download-artifact@v4
+      with:
+        name: vlasiator-tools
+    - name: Run testpackage
+      id: run
+      run: |
+        chmod +x $GITHUB_WORKSPACE/vlasiator
+        chmod +x $GITHUB_WORKSPACE/vlsv*_DP
+        cd testpackage
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GITHUB_WORKSPACE/libraries/lib
+        sbatch -W -o testpackage_run_output.txt ./small_test_ukko_gpu_github_ci.sh
+        PARSE_OUTPUT_CMD=$( cat << MORO
+        echo "Job finished, checking output."
+        cat testpackage_run_output.txt
+        cat $GITHUB_STEP_SUMMARY > $GITHUB_WORKSPACE/testpackage_check_description.txt
+        cd $GITHUB_WORKSPACE
+        ls -halB testpackage_check_description.txt
+        tar -czf testpackage-output-gpu.tar.gz testpackage_check_description.txt testpackage_output_variables.txt
+        MORO
+        )
+        srun --job-name CI_package_results -M ukko -N 1 -c 1 --mem=1G bash -c "$PARSE_OUTPUT_CMD"
+        if [ -f $GITHUB_WORKSPACE/testpackage_failed ]; then
+          # Fail this step if any test failed (marker file testpackage_failed exists).
+          exit 1
+        fi
+    - name: Scancel dangling job upon cancellation
+      if: cancelled()
+      run: |
+        # Cancel the Slurm job using the job id echoed by the job script (run step output).
+        scancel ${{ steps.run.outputs.SLURM_JOB_ID }}
+    - name: Make sure the output tarball is visible in lustre
+      uses: nick-fields/retry@v3
+      with:
+        timeout_seconds: 15
+        max_attempts: 3
+        retry_on: error
+        command: ls $GITHUB_WORKSPACE/testpackage-output-gpu.tar.gz
+    - name: Upload testpackage output
+      uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: testpackage-output-gpu
+        path: testpackage-output-gpu.tar.gz
+      # Note: Testpackage output is further processed in the pr_report.yml workflow
+      # (to produce Checks against pull requests)
+
   build_ionosphereTests:
     # Build IonosphereSolverTests miniApp
     runs-on: ubuntu-latest

diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,10 @@ diagnostic.txt
 *.vlsv
 *.silo
 *.o
+*.ll
+*.gpu
+*.ptx
+*.s
 *.d
 vscode/
 .vscode/

diff --git a/.gitmodules b/.gitmodules
@@ -19,3 +19,7 @@
 [submodule "submodules/vectorclass-addon"]
 	path = submodules/vectorclass-addon
 	url = https://github.com/vectorclass/add-on
+
+[submodule "submodules/hashinator"]
+	path = submodules/hashinator
+	url = https://github.com/fmihpc/hashinator.git
diff --git a/MAKE/Makefile.Freezer b/MAKE/Makefile.Freezer
@@ -28,6 +28,7 @@ endif
 # MATHFLAGS are for special math etc. flags, these are only applied on solver functions
 # LDFLAGS flags for linker
 
+# -march=native -fpermissive
 CXXFLAGS += -O3 -fopenmp -funroll-loops -std=c++20 -W -Wall -Wno-unused -fabi-version=0 -mfma -mavx2 -Wno-unknown-pragmas -Wno-sign-compare
 testpackage: CXXFLAGS = -g -ggdb -O2 -fopenmp -funroll-loops -std=c++20 -fabi-version=0 -mno-avx -mno-fma -fno-unsafe-math-optimizations
 
@@ -68,3 +69,7 @@ LIB_VLSV = -L$(LIBRARY_PREFIX)/vlsv -lvlsv -Wl,-rpath=$(LIBRARY_PREFIX)/vlsv/lib
 
 LIB_PROFILE = -L$(LIBRARY_PREFIX)/phiprof/lib -lphiprof -Wl,-rpath=$(LIBRARY_PREFIX)/phiprof/lib
 INC_PROFILE = -I$(LIBRARY_PREFIX)/phiprof/include
+
+# Enable NVTX in CPU mode as well (uses the phiprof_nvcc build of phiprof):
+#LIB_PROFILE = -L$(LIBRARY_PREFIX)/phiprof_nvcc/lib -lphiprof -Wl,-rpath=$(LIBRARY_PREFIX)/phiprof_nvcc/lib
+#INC_PROFILE = -I$(LIBRARY_PREFIX)/phiprof_nvcc/include
diff --git a/MAKE/Makefile.Freezer_cuda b/MAKE/Makefile.Freezer_cuda
@@ -0,0 +1,101 @@
+# Markus' desktop computer, CUDA
+# Can be used as a sample on how to generate local CUDA makefiles
+#
+# Note: CUDA versions before 11.6 will complain when compiling backgroundfields
+#  (error: parameter packs not expanded with ‘...’:)
+# this is fixed by installing at least version 11.6
+
+#======== Vectorization ==========
+#Set vector backend type for vlasov solvers, sets precision and length.
+#Options:
+# AVX:	    VEC4D_AGNER, VEC4F_AGNER, VEC8F_AGNER
+# AVX512:   VEC8D_AGNER, VEC16F_AGNER
+# Fallback: VECTORCLASS = VEC_FALLBACK_GENERIC (Defaults to VECL8)
+
+ifeq ($(DISTRIBUTION_FP_PRECISION),SPF)
+#Single-precision        
+        VECTORCLASS = VEC_FALLBACK_GENERIC
+else
+#Double-precision
+        VECTORCLASS = VEC_FALLBACK_GENERIC
+endif
+
+#===== Vector Lengths ====
+# Default for VEC_FALLBACK_GENERIC is WID=4, VECL=8
+# NOTE: A bug currently results in garbage data already on cell init if VECL is not equal to WID2 (WID squared, i.e. 16 for WID=4)
+#WID=8
+#VECL=64
+WID=4
+VECL=16
+
+#======= Compiler and compilation flags =========
+# NOTES on compiler flags:
+# CXXFLAGS is for compiler flags, they are always used
+# MATHFLAGS are for special math etc. flags, these are only applied on solver functions
+# LDFLAGS flags for linker
+
+USE_CUDA=1
+
+# Tell mpic++ to use nvcc for all compiling
+CMP = OMPI_CXX='nvcc' OMPI_CXXFLAGS='' mpic++
+
+# Now tell also the linker to use nvcc
+# These are found with  mpic++ --showme:link
+# The line below indeed uses OMPI_CXX, not OMPI_LD
+LNK = OMPI_CXX='nvcc' OMPI_CXXFLAGS='-arch=sm_60' OMPI_LIBS='-L/usr/lib/x86_64-linux-gnu/openmpi/lib' OMPI_LDFLAGS='-lmpi_cxx -lmpi' mpic++
+
+#-G (device debug) overrides --generate-line-info -line-info
+# but also requires more device-side resources to run
+# use "-Xptxas -v" for verbose output of ptx compilation
+
+# GeForce GTX 1060 6GB has compute capability 6.1
+# https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+
+CXXFLAGS = -g -O3 -x cu -std=c++17 -Xcompiler -std=c++17 --extended-lambda --expt-relaxed-constexpr -gencode arch=compute_60,code=sm_60 -Xcompiler -fopenmp --generate-line-info -line-info -Xcompiler="-fpermissive"  --extra-device-vectorization
+testpackage: CXXFLAGS = -g -O2 -x cu -std=c++17 --extended-lambda --expt-relaxed-constexpr -gencode arch=compute_60,code=sm_60 -Xcompiler -fopenmp --generate-line-info -line-info -Xcompiler="-fpermissive"
+
+
+MATHFLAGS = --use_fast_math
+# nvcc fast_math does not assume only finite math
+testpackage: MATHFLAGS = --prec-sqrt=true --prec-div=true --ftz=false --fmad=false
+
+LDFLAGS = -O2 -g -lnvToolsExt
+LIB_MPI = -lgomp
+
+LIB_CUDA = -L/usr/local/cuda/lib64
+INC_CUDA = -isystem /usr/local/cuda/include
+
+#======== PAPI ==========
+#Add PAPI_MEM define to use papi to report memory consumption?
+#CXXFLAGS += -DPAPI_MEM
+#testpackage: CXXFLAGS += -DPAPI_MEM
+
+#======== Allocator =========
+#jemalloc is CPU only
+
+#======== Libraries ===========
+LIBRARY_PREFIX = /home/markusb/git/vlasiator-lib
+
+INC_BOOST = -isystem /usr/include/boost
+LIB_BOOST = -L/usr/include/boost -lboost_program_options
+
+INC_ZOLTAN = -isystem /usr/include/trilinos
+LIB_ZOLTAN = -L/usr/lib/x86_64-linux-gnu -ltrilinos_zoltan
+
+# INC_PAPI = -I$(LIBRARY_PREFIX)/papi/include
+# LIB_PAPI = -I$(LIBRARY_PREFIX)/papi/lib -Wl,-rpath=$(LIBRARY_PREFIX)/papi/lib
+
+INC_VLSV = -I$(LIBRARY_PREFIX)/vlsv
+LIB_VLSV = -L$(LIBRARY_PREFIX)/vlsv -lvlsv -Xlinker=-rpath=$(LIBRARY_PREFIX)/vlsv/lib
+
+LIB_PROFILE = -L$(LIBRARY_PREFIX)/phiprof/lib -lphiprof -Xlinker=-rpath=$(LIBRARY_PREFIX)/phiprof/lib
+INC_PROFILE = -I$(LIBRARY_PREFIX)/phiprof/include
+
+#======== Header-only Libraries ===========
+
+INC_EIGEN = -isystem ./submodules/eigen
+INC_DCCRG = -I./submodules/dccrg
+INC_FSGRID = -I./submodules/fsgrid
+INC_HASHINATOR = -isystem ./submodules/hashinator/
+# Vectorclass only for CPU mode
+# INC_VECTORCLASS = -I ./submodules/vectorclass/ -I ./submodules/vectorclass-addon/vector3d/
diff --git a/MAKE/Makefile.appleM1 b/MAKE/Makefile.appleM1
@@ -7,20 +7,22 @@ LNK = mpic++
 #Options: 
 # AVX:	    VEC4D_AGNER, VEC4F_AGNER, VEC8F_AGNER
 # AVX512:   VEC8D_AGNER, VEC16F_AGNER
-# Fallback: VEC4D_FALLBACK, VEC4F_FALLBACK, VEC8F_FALLBACK
+# AVX512:   VEC8D_AGNER, VEC16F_AGNER
+# Fallback: VEC_FALLBACK_GENERIC
 
 ifeq ($(DISTRIBUTION_FP_PRECISION),SPF)
 #Single-precision        
-	#VECTORCLASS = VEC_FALLBACK_GENERIC
-	#VECTORCLASS = VEC8F_AGNER
-	VECTORCLASS = VEC8F_FALLBACK
+        VECTORCLASS = VEC_FALLBACK_GENERIC
 else
 #Double-precision
-	#VECTORCLASS = VEC4D_AGNER
-	#VECTORCLASS = VEC_FALLBACK_GENERIC
-	VECTORCLASS = VEC8D_FALLBACK
+        VECTORCLASS = VEC_FALLBACK_GENERIC
 endif
 
+#===== Vector Lengths ====
+# Default for VEC_FALLBACK_GENERIC is WID=4, VECL=8
+WID=4
+VECL=8
+
 #======== PAPI ==========
 #Add PAPI_MEM define to use papi to report memory consumption?
 #CXXFLAGS +=  -DPAPI_MEM   # Papi does not work on MacOS, see https://stackoverflow.com/questions/69531604/installing-papi-on-macos

diff --git a/MAKE/Makefile.arriesgado b/MAKE/Makefile.arriesgado
@@ -14,20 +14,21 @@ LNK = mpic++
 #Options: 
 # AVX:      VEC4D_AGNER, VEC4F_AGNER, VEC8F_AGNER
 # AVX512:   VEC8D_AGNER, VEC16F_AGNER
-# Fallback: VEC4D_FALLBACK, VEC4F_FALLBACK, VEC8F_FALLBACK
+# Fallback: VEC_FALLBACK_GENERIC
 
 ifeq ($(DISTRIBUTION_FP_PRECISION),SPF)
 #Single-precision        
-        VECTORCLASS = VEC8F_FALLBACK
-        INC_VECTORCLASS = -I$(LIBRARY_PREFIX)/../vlasiator/vlasovsolver
+        VECTORCLASS = VEC_FALLBACK_GENERIC
 else
 #Double-precision
-#       VECTORCLASS = VEC4D_AGNER
-#       INC_VECTORCLASS = -I$(LIBRARY_PREFIX)/vectorclass
-        VECTORCLASS = VEC4D_FALLBACK
-        INC_VECTORCLASS = -I$(LIBRARY_PREFIX)/../vlasiator/vlasovsolver
+        VECTORCLASS = VEC_FALLBACK_GENERIC
 endif
 
+#===== Vector Lengths ====
+# Default for VEC_FALLBACK_GENERIC is WID=4, VECL=8
+WID=4
+VECL=8
+
 FLAGS =
 # note: std was c++11
 CXXFLAGS = -O1 -std=c++20 -W -Wall -pedantic -Wno-unused -Wno-unused-parameter -Wno-missing-braces  -fopenmp -march=rv64imafdc -isystem /usr/lib/gcc/riscv64-linux-gnu/11/include/

diff --git a/MAKE/Makefile.hawk_intel_mpt b/MAKE/Makefile.hawk_intel_mpt
@@ -26,7 +26,7 @@ FLAGS =
 #GNU flags:
 CC_BRAND = intel
 CC_BRAND_VERSION = 19.1.0
-# note: std was not updated to c++17
+# note: std was not updated to c++20
 CXXFLAGS += -traceback -g -O3 -qopenmp -std=c++14 -W -Wall -Wno-unused -march=core-avx2 -qopt-zmm-usage=high
 testpackage: CXXFLAGS = -g -traceback -O2 -qopenmp -std=c++14 -W -Wno-unused -march=core-avx2
 not_parallel_tools: CXXFLAGS += -march=native -mno-avx2 -mavx
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,10 @@ diagnostic.txt @@
     *.vlsv
     *.silo
     *.o
+    *.ll
+    *.gpu
+    *.ptx
+    *.s
     *.d
     vscode/
     .vscode/
@@ Expand Down @@