Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add libthapi-ctl to start/stop collecting #233

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
cb3a044
add libthapi-ctl
bd4 Jun 7, 2024
94d4659
xprof: separate bookkeeping events for cuda
bd4 Jun 7, 2024
45f92f2
WIP gtensor int test
bd4 Jun 9, 2024
21cffca
libthapi-ctl: logging with THAPI_CTL_LOG_LEVEL
bd4 Jun 10, 2024
c387ae0
cuda: fix bookkeeping events
bd4 Jun 13, 2024
beb1e54
doxygen comments
bd4 Jun 13, 2024
32b12dc
WIP cuda event based enable/disable
bd4 Jun 14, 2024
20a70f5
clean up WIP cuda event based start/stop
bd4 Jun 20, 2024
a7adec5
log message clean up
bd4 Jun 20, 2024
4dbbe2a
fix
bd4 Jun 20, 2024
b3ddb0c
fix
bd4 Jun 24, 2024
32bddbe
cuda exclusions, cleanup
bd4 Jun 24, 2024
41ef9ff
fix exclude print
bd4 Jun 24, 2024
f567f4c
rename env vars, fix cuda, TRACE_FROM_START env var
bd4 Jun 25, 2024
bf35bc5
working opencl start/stop
bd4 Jun 25, 2024
fefd4b8
make test names match new arg name
bd4 Jun 25, 2024
9b3fd78
WIP better thapi-ctl int tests
bd4 Jun 26, 2024
aafc959
fix cl int test
bd4 Jun 27, 2024
23f280f
missing header
bd4 Jun 27, 2024
803e64f
add opencl fn to header
bd4 Jun 27, 2024
2fd0a71
libthapictl: use autotools dep for lttng-ctl
bd4 Jun 27, 2024
aea464b
WIP omp start/stop, int tests
bd4 Jun 27, 2024
fb6e46f
libthapictl: add design readme
bd4 Jun 27, 2024
72b57fb
ci: use lttng ppa to get latest stable
bd4 Jun 27, 2024
743757c
fix libthapictl header missing in dist
bd4 Jun 28, 2024
33ae513
xprof: make thapi-ctl work w/o PATH mod
bd4 Jun 28, 2024
4ea5771
ci: fix thapictl empty sync daemon
bd4 Jun 28, 2024
018682e
working gtensor int test for cuda
bd4 Jun 28, 2024
edf028b
use thapi-ctl for hip
bd4 Jun 28, 2024
3abe9a4
improve gtensor int tests, add readme
bd4 Jun 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions .github/workflows/presubmit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
if: steps.babeltrace2.outputs.cache-hit != 'true'
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
if: steps.babeltrace2.outputs.cache-hit != 'true'
Expand Down Expand Up @@ -69,7 +70,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -112,7 +114,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -146,9 +149,12 @@ jobs:
include:
- bats_file: 'iprof'
thapi_sync_daemon: 'fs' # Need to force, if not `THAPI_SYNC_DAEMON` will be set to empty list
- bats_file: 'libthapi-ctl'
thapi_sync_daemon: 'fs'
env:
THAPI_TEST_BIN: clinfo
THAPI_BIN_DIR: ./build/ici/bin
THAPI_BIN_DIR: ${{ github.workspace }}/build/ici/bin
THAPI_INSTALL_DIR: ${{ github.workspace }}/build/ici
THAPI_SYNC_DAEMON: ${{ matrix.thapi_sync_daemon }}
steps:
- uses: actions/checkout@v4
Expand All @@ -165,12 +171,16 @@ jobs:
name: thapi-bin
- name: Untar THAPI
run: tar -xvf thapi.tar
- run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils liblttng-ctl-dev ocl-icd-opencl-dev jq
- run: sudo gem install babeltrace2 opencl_ruby_ffi
- name: Load Babeltrace2
run: |
echo "$HOME/babeltrace2/2.0.4/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.4/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- name: ldconfig
run: |
ldconfig -p | grep libOpenCL
- name: integration test ${{ matrix.bats_file }}
run: |
bats integration_tests/${{ matrix.bats_file }}.bats
Expand All @@ -189,7 +199,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -227,7 +238,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -259,7 +271,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ACLOCAL_AMFLAGS = -I m4
SUBDIRS = utils xprof sampling opencl ze cuda omp hip
SUBDIRS = utils xprof sampling opencl ze cuda omp hip libthapictl

EXTRA_DIST = autogen.sh README.md .valgrind/dlopen.supp
2 changes: 2 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ AM_CONDITIONAL([FOUND_MPI], [test "x$found_mpi" = xyes])
PKG_CHECK_MODULES([LIBFFI], [libffi >= 3.2])
PKG_CHECK_MODULES([BABELTRACE2], [babeltrace2 >= 2.0])
PKG_CHECK_MODULES([LTTNG_UST], [lttng-ust >= 2.10])
PKG_CHECK_MODULES([LTTNG_CTL], [lttng-ctl >= 2.13.13])
PKG_CHECK_MODULES([PROTOBUF], [protobuf >= 3.0])

AX_RUBY_EXTENSION([cast-to-yaml], [yes])
Expand Down Expand Up @@ -114,6 +115,7 @@ AC_CONFIG_FILES([
hip/Makefile
xprof/Makefile
utils/Makefile
libthapictl/Makefile
])
AC_CONFIG_FILES([utils/test_wrapper_thapi_text_pretty.sh], [chmod +x utils/test_wrapper_thapi_text_pretty.sh])
AC_CONFIG_FILES([opencl/tracer_opencl.sh], [chmod +x opencl/tracer_opencl.sh])
Expand Down
2 changes: 1 addition & 1 deletion cuda/cuda_model.rb
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def upper_snake_case(str)
fullname = "#{name}#{versions.size - i > 1 ? "_v#{versions.size - i}" : ""}#{suffix}"
fullname = "#{name}_v2#{suffix}" unless command_names.include?(fullname)
sstr = <<EOF
if (tracepoint_enabled(lttng_ust_cuda, #{fullname}_#{START}) #{suffixes.size > 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) {
if ((!_trace_from_start() || tracepoint_enabled(lttng_ust_cuda, #{fullname}_#{START})) #{suffixes.size > 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) {
wrap_#{fullname}(pfn);
}
EOF
Expand Down
9 changes: 9 additions & 0 deletions cuda/tracer_cuda_helpers.include.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,15 @@ static inline void _primary_context_reset(CUdevice dev) {
pthread_mutex_unlock(&_primary_contexts_mutex);
}

/* Returns 1 if tracing should be active from program start, 0 otherwise.
 *
 * Controlled by the LTTNG_UST_TRACE_FROM_START environment variable:
 * trace-from-start is the default (variable unset, or set exactly to "1");
 * any other value disables it. The answer is computed on first call and
 * cached for the lifetime of the process, so later changes to the
 * environment have no effect.
 */
static int _trace_from_start(void) {
  /* -1 marks "not yet computed"; getenv/strcmp run only once. */
  static int trace_from_start = -1;
  if (trace_from_start == -1) {
    const char *env_value = getenv("LTTNG_UST_TRACE_FROM_START");
    trace_from_start = (env_value == NULL || strcmp(env_value, "1") == 0);
  }
  return trace_from_start;
}

static void THAPI_ATTRIBUTE_DESTRUCTOR
_lib_cleanup() {
if (_do_cleanup) {
Expand Down
37 changes: 37 additions & 0 deletions integration_tests/gtensor.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env bats

setup_file() {
  # iprof from the THAPI installation under test (THAPI_INSTALL_DIR is
  # exported by the CI workflow).
  export IPROF=$THAPI_INSTALL_DIR/bin/iprof

  # One out-of-source build directory per device backend (cuda/hip/sycl).
  bdir="build-$GTENSOR_DEVICE"

  # Configure and build the test binary; run in a subshell so the working
  # directory of the surrounding test process is unchanged.
  (cd integration_tests/gtensor;
  cmake -S . -B $bdir -DGTENSOR_DEVICE=$GTENSOR_DEVICE \
  -DTHAPI_PATH=$THAPI_INSTALL_DIR \
  && cmake --build $bdir)

  # Absolute path to the freshly built binary, used by the @test cases.
  export THAPI_TEST_BIN=$(pwd)/integration_tests/gtensor/$bdir/axpy_start_stop
  # Make libthapi-ctl.so resolvable at run time.
  export LD_LIBRARY_PATH=$THAPI_INSTALL_DIR/lib:$LD_LIBRARY_PATH
}

teardown_file() {
  # Remove traces produced by the test runs. Guard against an unset or
  # empty THAPI_HOME: the original unquoted expansion would have run
  # `rm -rf /thapi-traces` at the filesystem root in that case.
  if [ -n "${THAPI_HOME:-}" ]; then
    rm -rf "${THAPI_HOME}/thapi-traces"
  fi
}

@test "axpy-${GTENSOR_DEVICE}-trace-from-start" {
  # Default behavior: tracing is active from program start, so the 0th
  # kernel launch is captured in addition to the 5 launches made while
  # thapi_ctl_start/stop explicitly enables tracing (odd iterations).
  $IPROF --debug 0 $THAPI_TEST_BIN $TEST_ARGS
  # NOTE: gtensor kernel names are complex, but all start with gt::, look it
  # up rather than hard coding and making it fragile. Includes quotes.
  # Read the replay summary once so the name lookup and the call count come
  # from the same snapshot (the original invoked `$IPROF -r -j` twice).
  summary=$($IPROF -r -j)
  kernel_name=$(echo "$summary" | jq '.device.data | keys[]' | grep 'gt::')
  write_count=$(echo "$summary" | jq ".device.data | .[${kernel_name}].call")
  # 0th call, plus 5 odd calls
  [ "$write_count" -eq 6 ]
}

@test "axpy-${GTENSOR_DEVICE}-no-trace-from-start" {
  # With --no-trace-from-start, profiling is disabled until the first
  # thapi_ctl_start, so only the 5 odd iterations are captured.
  $IPROF --no-trace-from-start --debug 0 $THAPI_TEST_BIN $TEST_ARGS
  # Read the replay summary once so the name lookup and the call count come
  # from the same snapshot (the original invoked `$IPROF -r -j` twice).
  summary=$($IPROF -r -j)
  kernel_name=$(echo "$summary" | jq '.device.data | keys[]' | grep 'gt::')
  write_count=$(echo "$summary" | jq ".device.data | .[${kernel_name}].call")
  # 5 odd calls, does not include 0 since profiling should be disabled at start
  [ "$write_count" -eq 5 ]
}
38 changes: 38 additions & 0 deletions integration_tests/gtensor/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
cmake_minimum_required(VERSION 3.23 FATAL_ERROR)

project(thapi-integration-test)

# Device backend selected by the caller (cuda/hip/sycl); only cuda needs
# extra language/toolkit setup here.
if(GTENSOR_DEVICE STREQUAL "cuda")
  enable_language(CUDA)
  find_package(CUDAToolkit REQUIRED)
endif()

option(USE_THAPI_CTL "use thapi-ctl to start/stop tracing" ON)
set(THAPI_PATH "/opt/thapi" CACHE STRING "Path to thapi installation")

message(STATUS "${PROJECT_NAME}: THAPI_PATH=${THAPI_PATH}")

# add dependencies via the vendored CPM bootstrap (cmake/CPM.cmake)
include(cmake/CPM.cmake)
# NOTE(review): GIT_TAG "main" is not reproducible — consider pinning a
# release tag or commit hash so CI does not break on upstream changes.
CPMFindPackage(NAME gtensor
  GITHUB_REPOSITORY wdmapp/gtensor
  GIT_TAG "main"
  OPTIONS "GTENSOR_ENABLE_BLAS ON"
          "GTENSOR_ENABLE_SOLVER ON")

add_executable(axpy_start_stop)
target_gtensor_sources(axpy_start_stop PRIVATE axpy_start_stop.cxx)
target_link_libraries(axpy_start_stop gtensor::gtensor)

# Test the variable by name: `if(${USE_THAPI_CTL})` dereferences first and
# would break if the variable were ever undefined; if(USE_THAPI_CTL) is the
# idiomatic form for option() variables.
if(USE_THAPI_CTL)
  find_package(PkgConfig REQUIRED)
  pkg_check_modules(LTTNG_CTL REQUIRED lttng-ctl)

  # Imported interface target wrapping the installed libthapi-ctl.
  add_library(thapi_ctl INTERFACE IMPORTED)
  target_include_directories(thapi_ctl INTERFACE "${THAPI_PATH}/include")
  target_link_libraries(thapi_ctl INTERFACE "${THAPI_PATH}/lib/libthapi-ctl.so")
  target_link_libraries(thapi_ctl INTERFACE ${LTTNG_CTL_LDFLAGS})
  target_link_libraries(axpy_start_stop thapi_ctl)
else()
  # Without thapi-ctl, compile out the start/stop calls in the test source.
  target_compile_definitions(axpy_start_stop PRIVATE NO_THAPI_CTL)
endif()
13 changes: 13 additions & 0 deletions integration_tests/gtensor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# gtensor integration tests

Allows testing the cuda, hip, and ze backends with a single source file.

# Usage

```
GTENSOR_DEVICE=cuda bats ./integration_tests/gtensor.bats
```
Or set `GTENSOR_DEVICE` to "hip" or "sycl" (the "sycl" build exercises the THAPI ze backend).

Note that THAPI hip backend does not yet do device profiling, modifications will need to be
made for tests to pass on hip.
105 changes: 105 additions & 0 deletions integration_tests/gtensor/axpy_start_stop.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@

#include <iostream>

#include <gtensor/gtensor.h>

#ifndef NO_THAPI_CTL
#include "thapi-ctl.h"
#endif

// provides convenient shortcuts for common gtensor functions, for example
// underscore ('_') to represent open slice ends.
using namespace gt::placeholders;

/**
 * @brief Computes a*x + y ("daxpy") for host or device gtensors.
 *
 * Templated so the same code works with gt::space::host or gt::space::device
 * arrays. Returning by value is cheap: RVO / guaranteed copy elision (C++17)
 * moves the result into the LHS it is assigned to rather than copying the
 * data (important for large arrays). Inputs are taken by const reference to
 * avoid copies, since gtensor (like std::vector) has copy semantics by
 * default.
 *
 * NOTE(review): not called by main() in this file; kept as a reference
 * implementation of the expression being profiled.
 *
 * @param a scalar multiplier
 * @param x first input array (scaled by a)
 * @param y second input array (added)
 * @return a newly evaluated gtensor holding a*x + y
 */
template <typename Ex, typename Ey>
auto daxpy(double a, const Ex& x, const Ey& y)
{
  // 'a * x + y' builds an un-evaluated gfunction expression template;
  // gt::eval forces evaluation here so the return statement produces a
  // concrete gtensor. Omitting gt::eval would instead return the lazy
  // expression, which could be composed with further operations before
  // being evaluated or assigned to a gtensor.
  return gt::eval(a * x + y);
}

int main(int argc, char** argv)
{
  // Problem size and how many elements to show in the final spot check.
  int n = 1024 * 1024;
  int nprint = 32;

  double a = 0.5;

  // Define and allocate two 1d vectors of size n on the host. Declare
  // but don't allocate a third 1d host vector for storing the result.
  gt::gtensor<double, 1, gt::space::host> h_x(gt::shape(n));
  gt::gtensor<double, 1, gt::space::host> h_y = gt::empty_like(h_x);
  gt::gtensor<double, 1, gt::space::host> h_axpy;

  // Initialize the vectors: x is twice its index value and y equals its
  // index value. We will perform .5*x + y, so the result should be
  // axpy(i) = 2i.
  for (int i = 0; i < n; i++) {
    h_x(i) = 2.0 * static_cast<double>(i);
    h_y(i) = static_cast<double>(i);
  }

  // This test only exercises start/stop tracing of device kernels, so a
  // device build of gtensor is mandatory.
#ifndef GTENSOR_HAVE_DEVICE
#error "device required"
#endif

  // Define and allocate device versions of h_x and h_y, and declare
  // a variable for the result on gpu.
  gt::gtensor<double, 1, gt::space::device> d_x(gt::shape(n));
  gt::gtensor<double, 1, gt::space::device> d_y = gt::empty_like(d_x);
  gt::gtensor<double, 1, gt::space::device> d_axpy;

  // Explicit copies of input from host to device. Note that this is an
  // overload of the copy function for gtensor and gtensor_span types, not
  // std::copy which has a different signature. The source is the first
  // argument and destination the second argument. Currently thrust::copy is
  // used under the hood in the implementation.
  gt::copy(h_x, d_x);
  gt::copy(h_y, d_y);

  // This automatically generates a computation kernel to run on the
  // device. The expression is lazy; each gt::eval below launches it.
  auto expr = a * d_x + d_y;

  // Run the kernel 10 times, enabling tracing only on odd iterations via
  // thapi_ctl_start/stop. The companion gtensor.bats test expects 6 traced
  // launches by default (iteration 0 is captured because tracing is on
  // from program start, plus iterations 1,3,5,7,9) and 5 launches with
  // --no-trace-from-start.
  for (int i = 0; i < 10; i++) {
#ifndef NO_THAPI_CTL
    if (i % 2 == 1) {
      thapi_ctl_start();
    }
#endif

    d_axpy = gt::eval(expr);
    h_axpy = gt::empty_like(h_x);
    gt::copy(d_axpy, h_axpy);
    gt::synchronize();

    std::cout << "axpy(10) = " << h_axpy(10) << std::endl;

#ifndef NO_THAPI_CTL
    if (i % 2 == 1) {
      thapi_ctl_stop();
    }
#endif
  }

  // Define a slice to print a subset of elements for spot checking the
  // result.
  auto print_slice = gt::gslice(_, _, n / nprint);
  std::cout << "a = " << a << std::endl;
  std::cout << "x = " << h_x.view(print_slice) << std::endl;
  std::cout << "y = " << h_y.view(print_slice) << std::endl;
  std::cout << "a*x + y = " << h_axpy.view(print_slice) << std::endl;
}
24 changes: 24 additions & 0 deletions integration_tests/gtensor/cmake/CPM.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# SPDX-License-Identifier: MIT
#
# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
#
# Vendored CPM.cmake bootstrap: downloads the pinned CPM release on first
# configure and caches it locally. Do not hand-edit beyond bumping the
# version and its matching SHA256 below together.

set(CPM_DOWNLOAD_VERSION 0.39.0)
set(CPM_HASH_SUM "66639bcac9dd2907b2918de466783554c1334446b9874e90d38e3778d404c2ef")

# Prefer an explicit CPM_SOURCE_CACHE (CMake variable, then environment)
# so repeated configures across projects share one download.
if(CPM_SOURCE_CACHE)
set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)

# EXPECTED_HASH makes the download idempotent and tamper-evident; file()
# is a no-op when the cached copy already matches.
file(DOWNLOAD
https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
)

include(${CPM_DOWNLOAD_LOCATION})
Loading