doc rtd user guide document using OpenMP

LBL-EESA · Apr 13, 2022 · fa2de85 · fa2de85
1 parent 425cb42
commit fa2de85
Show file tree

Hide file tree

Showing 5 changed files with 114 additions and 0 deletions.
diff --git a/doc/rtd/index.rst b/doc/rtd/index.rst
@@ -46,6 +46,8 @@ HAMR is configured with CMake. The following CMake variables influence the build
 +-------------------------+----------------------------------------------------+
 | HAMR_ENABLE_HIP         | If set to ON enables HIP features. Default OFF     |
 +-------------------------+----------------------------------------------------+
+| HAMR_ENABLE_OPENMP      | If set to ON enables OpenMP features. Default OFF  |
++-------------------------+----------------------------------------------------+
 | HAMR_ENABLE_PYTHON      | If set to ON enables Python features. Default OFF  |
 +-------------------------+----------------------------------------------------+
 | BUILD_TESTING           | If set to ON enables regression tests. Default OFF |
@@ -221,6 +223,39 @@ The source code for the following examples is located in the `doc/rtd/source`
 folder. The C++ examples include a simple Makefile that can be edited to point
 to a build.
 
+.. _hello_openmp:
+
+Hello World! w/ C++ and OpenMP
+------------------------------
+This example illustrates coupling two codes, in this case functions, using HAMR
+so that they can process data produced either on the CPU or GPU without knowing
+specifically where the data passed to them resides. C++ smart pointers are used
+to manage temporary buffers if the passed data needed to be moved to the device
+where it was accessed.  See :ref:`buffer` for more information.  See
+:ref:`hello_cuda` for a CUDA implementation of this example. See :ref:`hello_hip`
+for a HIP implementation of this example. See :ref:`hello_cupy` for a Python
+implementation of this example.
+
+.. _openmp_add_array:
+
+.. literalinclude:: source/hello_openmp/add.h
+    :language: c++
+    :linenos:
+    :caption: Code that uses HAMR to access array based data in OpenMP. Calling `get_openmp_accessible` makes the arrays available in OpenMP if they are not. Then OpenMP device offloading may be applied as usual.
+
+
+.. literalinclude:: source/hello_openmp/write.h
+    :language: c++
+    :linenos:
+    :caption: Code that uses HAMR to access array based data on the CPU. Calling `get_cpu_accessible` makes the array available on the CPU if it is not.
+
+
+.. literalinclude:: source/hello_openmp/hello_openmp.cpp
+    :language: c++
+    :linenos:
+    :caption: This simple Hello World! program allocates an array on the GPU and an array on the CPU; both are initialized to 1. Then the dispatch code uses HAMR APIs to make sure that the data is accessible in OpenMP before launching a simple kernel that adds the two arrays. HAMR is used to make the data accessible on the CPU and print the result.
+
+
 .. _hello_cuda:
 
 Hello World! w/ C++ and CUDA

diff --git a/doc/rtd/source/hello_openmp/Makefile b/doc/rtd/source/hello_openmp/Makefile
@@ -0,0 +1,14 @@
+
+HAMR_SOURCE=../../../../
+HAMR_BUILD=../../../../build_omp
+
+# NVIDIA HPC Compiler (alternative: uncomment the next two lines and comment out the AMD ROCm settings below)
+#CXX=`which nvc++`
+#CXX_FLAGS=-mp=gpu -Minfo
+
+# AMD ROCm compiler (active settings; -march=gfx1030 names the target GPU architecture, edit it to match your device)
+CXX=/opt/rocm/llvm/bin/amdclang++
+CXX_FLAGS=-target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx1030
+
+all:
+	${CXX} ${CXX_FLAGS} hello_openmp.cpp -I${HAMR_SOURCE} -I${HAMR_BUILD} -std=c++14 -L${HAMR_BUILD}/lib/ -lhamr
diff --git a/doc/rtd/source/hello_openmp/add.h b/doc/rtd/source/hello_openmp/add.h
@@ -0,0 +1,24 @@
+template <typename T, typename U> // T: element type of a1 and of the result; U: element type of a2
+hamr::buffer<T> add(const hamr::buffer<T> &a1, const hamr::buffer<U> &a2) // returns a new buffer with a1[i] + a2[i]
+{
+    // get pointers to the inputs that are accessible from OpenMP target regions; keep spa1/spa2 in scope while pa1/pa2 are used (per the user guide the smart pointers manage any temporary copy)
+    auto spa1 = a1.get_openmp_accessible();
+    const T *pa1 = spa1.get();
+
+    auto spa2 = a2.get_openmp_accessible();
+    const U *pa2 = spa2.get();
+
+    // allocate the result with the OpenMP allocator (n_vals elements, constructed from T(0)) and get a raw pointer to it
+    size_t n_vals = a1.size(); // NOTE(review): a2.size() is never checked; the loop below assumes a2 holds at least a1.size() elements
+    hamr::buffer<T> ao(hamr::buffer_allocator::openmp, n_vals, T(0));
+    T *pao = ao.data();
+
+    // offload the element-wise add; is_device_ptr declares that pao, pa1 and pa2 are already device pointers, so no map clauses are used
+    #pragma omp target teams distribute parallel for is_device_ptr(pao, pa1, pa2)
+    for (size_t i = 0; i < n_vals; ++i)
+    {
+        pao[i] = pa1[i] + pa2[i]; // the sum is converted to T on assignment
+    }
+
+    return ao;
+}
diff --git a/doc/rtd/source/hello_openmp/hello_openmp.cpp b/doc/rtd/source/hello_openmp/hello_openmp.cpp
@@ -0,0 +1,26 @@
+#include <hamr_buffer.h>
+
+#include <iostream>
+#include <memory>
+
+#include "add.h"
+#include "write.h"
+
+int main(int, char **) // command line arguments are not used
+{
+    size_t n_vals = 400; // number of elements in each array
+
+    // allocate n_vals floats with the OpenMP allocator (the GPU in this example) and initialize each to 1
+    hamr::buffer<float> a0(hamr::buffer_allocator::openmp, n_vals, 1.0f);
+
+    // allocate n_vals floats with the malloc allocator (the CPU) and initialize each to 1
+    hamr::buffer<float> a1(hamr::buffer_allocator::malloc, n_vals, 1.0f);
+
+    // add the two arrays; add() (add.h) obtains OpenMP accessible views of both inputs, so they may live in different places
+    hamr::buffer<float> a2 = add(a0, a1);
+
+    // write the result to stderr; write() (write.h) obtains a CPU accessible view first, every value is expected to be 2
+    write(std::cerr, a2);
+
+    return 0;
+}
diff --git a/doc/rtd/source/hello_openmp/write.h b/doc/rtd/source/hello_openmp/write.h
@@ -0,0 +1,15 @@
+template <typename T> // T: element type of the buffer
+void write(std::ostream &os, const hamr::buffer<T> &ai) // prints every element of ai to os on a single line
+{
+    // get pointer to the input array that is safe to use on the CPU; keep spai in scope while pai is used (per the user guide the smart pointer manages any temporary copy)
+    auto spai = ai.get_cpu_accessible();
+    const T *pai = spai.get();
+
+    // write the elements of the array to the stream, each followed by a single space
+    for (size_t i = 0; i < ai.size(); ++i)
+    {
+        os << pai[i] << " ";
+    }
+
+    os << std::endl; // end the line and flush the stream
+}