diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 7380144d..5c69ec14 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -28,7 +28,8 @@ jobs:
         with:
           path: ~/babeltrace2/2.0.4
           key: ${{ runner.os }}-build-${{ env.cache-name }}
-      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
+      - run: sudo apt-add-repository ppa:lttng/stable-2.13
+      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
         if: steps.babeltrace2.outputs.cache-hit != 'true'
       - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
         if: steps.babeltrace2.outputs.cache-hit != 'true'
@@ -69,7 +70,8 @@ jobs:
         with:
           path: ~/babeltrace2/2.0.4
           key: ${{ runner.os }}-build-${{ env.cache-name }}
-      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
+      - run: sudo apt-add-repository ppa:lttng/stable-2.13
+      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
       - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
       - name: Load Babeltrace2
         run: |
@@ -112,7 +114,8 @@ jobs:
         with:
           path: ~/babeltrace2/2.0.4
           key: ${{ runner.os }}-build-${{ env.cache-name }}
-      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler 
+      - run: sudo apt-add-repository ppa:lttng/stable-2.13
+      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler 
       - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
       - name: Load Babeltrace2
         run: |
@@ -146,9 +149,12 @@ jobs:
         include:
           - bats_file: 'iprof'
             thapi_sync_daemon: 'fs' # Need to force, if not `THAPI_SYNC_DAEMON` will be set to empty list
+          - bats_file: 'libthapi-ctl'
+            thapi_sync_daemon: 'fs'
     env:
       THAPI_TEST_BIN: clinfo
-      THAPI_BIN_DIR: ./build/ici/bin
+      THAPI_BIN_DIR: ${{ github.workspace }}/build/ici/bin
+      THAPI_INSTALL_DIR: ${{ github.workspace }}/build/ici
       THAPI_SYNC_DAEMON: ${{ matrix.thapi_sync_daemon }}
     steps:
       - uses: actions/checkout@v4
@@ -165,12 +171,16 @@ jobs:
           name: thapi-bin
       - name: Untar THAPI
         run: tar -xvf thapi.tar
-      - run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils
+      - run: sudo apt-add-repository ppa:lttng/stable-2.13
+      - run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils liblttng-ctl-dev ocl-icd-opencl-dev jq
       - run: sudo gem install babeltrace2 opencl_ruby_ffi
       - name: Load Babeltrace2
         run: |
           echo "$HOME/babeltrace2/2.0.4/bin" >> $GITHUB_PATH
           echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.4/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+      - name: ldconfig
+        run: |
+          ldconfig -p | grep libOpenCL
       - name: integration test ${{ matrix.bats_file }}
         run: |
           bats integration_tests/${{ matrix.bats_file }}.bats
@@ -189,7 +199,8 @@ jobs:
         with:
           path: ~/babeltrace2/2.0.4
           key: ${{ runner.os }}-build-${{ env.cache-name }}
-      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
+      - run: sudo apt-add-repository ppa:lttng/stable-2.13
+      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
       - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
       - name: Load Babeltrace2
         run: |
@@ -227,7 +238,8 @@ jobs:
         with:
           path: ~/babeltrace2/2.0.4
           key: ${{ runner.os }}-build-${{ env.cache-name }}
-      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
+      - run: sudo apt-add-repository ppa:lttng/stable-2.13
+      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
       - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
       - name: Load Babeltrace2
         run: |
@@ -259,7 +271,8 @@ jobs:
         with:
           path: ~/babeltrace2/2.0.4
           key: ${{ runner.os }}-build-${{ env.cache-name }}
-      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
+      - run: sudo apt-add-repository ppa:lttng/stable-2.13
+      - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
       - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
       - name: Load Babeltrace2
         run: |
diff --git a/Makefile.am b/Makefile.am
index 5b9ba8f1..bc244edb 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,4 @@
 ACLOCAL_AMFLAGS = -I m4
-SUBDIRS = utils xprof sampling opencl ze cuda omp hip
+SUBDIRS = utils xprof sampling opencl ze cuda omp hip libthapictl
 
 EXTRA_DIST = autogen.sh README.md .valgrind/dlopen.supp
diff --git a/configure.ac b/configure.ac
index e8d3b8a2..e5832d4d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -72,6 +72,7 @@ AM_CONDITIONAL([FOUND_MPI], [test "x$found_mpi" = xyes])
 PKG_CHECK_MODULES([LIBFFI], [libffi >= 3.2])
 PKG_CHECK_MODULES([BABELTRACE2], [babeltrace2 >= 2.0])
 PKG_CHECK_MODULES([LTTNG_UST], [lttng-ust >= 2.10])
+PKG_CHECK_MODULES([LTTNG_CTL], [lttng-ctl >= 2.13.13])
 PKG_CHECK_MODULES([PROTOBUF], [protobuf >= 3.0])
 
 AX_RUBY_EXTENSION([cast-to-yaml], [yes])
@@ -114,6 +115,7 @@ AC_CONFIG_FILES([
 	hip/Makefile
 	xprof/Makefile
 	utils/Makefile
+  libthapictl/Makefile
 ])
 AC_CONFIG_FILES([utils/test_wrapper_thapi_text_pretty.sh], [chmod +x utils/test_wrapper_thapi_text_pretty.sh])
 AC_CONFIG_FILES([opencl/tracer_opencl.sh], [chmod +x opencl/tracer_opencl.sh])
diff --git a/cuda/cuda_model.rb b/cuda/cuda_model.rb
index 0d49bf1d..18f96c7d 100644
--- a/cuda/cuda_model.rb
+++ b/cuda/cuda_model.rb
@@ -307,7 +307,7 @@ def upper_snake_case(str)
         fullname = "#{name}#{versions.size - i > 1 ? "_v#{versions.size - i}" : ""}#{suffix}"
         fullname = "#{name}_v2#{suffix}" unless command_names.include?(fullname)
         sstr = <<EOF
-    if (tracepoint_enabled(lttng_ust_cuda, #{fullname}_#{START}) #{suffixes.size > 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) {
+    if ((!_trace_from_start() || tracepoint_enabled(lttng_ust_cuda, #{fullname}_#{START})) #{suffixes.size > 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) {
       wrap_#{fullname}(pfn);
     }
 EOF
diff --git a/cuda/tracer_cuda_helpers.include.c b/cuda/tracer_cuda_helpers.include.c
index c18d1bac..0df77acb 100644
--- a/cuda/tracer_cuda_helpers.include.c
+++ b/cuda/tracer_cuda_helpers.include.c
@@ -483,6 +483,15 @@ static inline void _primary_context_reset(CUdevice dev) {
   pthread_mutex_unlock(&_primary_contexts_mutex);
 }
 
+static int _trace_from_start() {
+  static int trace_from_start = -1;
+  if (trace_from_start == -1) {
+    const char *env_value = getenv("LTTNG_UST_TRACE_FROM_START");
+    trace_from_start = (env_value == NULL || strcmp(env_value, "1") == 0);
+  }
+  return trace_from_start;
+}
+
 static void THAPI_ATTRIBUTE_DESTRUCTOR
 _lib_cleanup() {
   if (_do_cleanup) {
diff --git a/integration_tests/gtensor.bats b/integration_tests/gtensor.bats
new file mode 100644
index 00000000..89fda79f
--- /dev/null
+++ b/integration_tests/gtensor.bats
@@ -0,0 +1,37 @@
+#!/usr/bin/env bats
+
+setup_file() {
+   export IPROF=$THAPI_INSTALL_DIR/bin/iprof
+
+   bdir="build-$GTENSOR_DEVICE"
+
+   (cd integration_tests/gtensor;
+    cmake -S . -B $bdir -DGTENSOR_DEVICE=$GTENSOR_DEVICE \
+       -DTHAPI_PATH=$THAPI_INSTALL_DIR \
+    && cmake --build $bdir)
+
+   export THAPI_TEST_BIN=$(pwd)/integration_tests/gtensor/$bdir/axpy_start_stop
+   export LD_LIBRARY_PATH=$THAPI_INSTALL_DIR/lib:$LD_LIBRARY_PATH
+}
+
+teardown_file() {
+   rm -rf $THAPI_HOME/thapi-traces
+}
+
+@test "axpy-${GTENSOR_DEVICE}-trace-from-start" {
+   $IPROF --debug 0 $THAPI_TEST_BIN $TEST_ARGS
+   # NOTE: gtensor kernel names are complex, but all start with gt::, look it up rather
+   # than hard coding and making it fragile. Includes quotes.
+   kernel_name=$($IPROF -r -j | jq '.device.data | keys[]' | grep 'gt::')
+   write_count=$($IPROF -r -j | jq ".device.data | .[${kernel_name}].call")
+   # 0th call, plus 5 odd calls
+   [ $write_count -eq 6 ]
+}
+
+@test "axpy-${GTENSOR_DEVICE}-no-trace-from-start" {
+   $IPROF --no-trace-from-start --debug 0 $THAPI_TEST_BIN $TEST_ARGS
+   kernel_name=$($IPROF -r -j | jq '.device.data | keys[]' | grep 'gt::')
+   write_count=$($IPROF -r -j | jq ".device.data | .[${kernel_name}].call")
+   # 5 odd calls, does not include 0 since profiling should be disabled at start
+   [ $write_count -eq 5 ]
+}
diff --git a/integration_tests/gtensor/CMakeLists.txt b/integration_tests/gtensor/CMakeLists.txt
new file mode 100644
index 00000000..519b899c
--- /dev/null
+++ b/integration_tests/gtensor/CMakeLists.txt
@@ -0,0 +1,38 @@
+cmake_minimum_required(VERSION 3.23 FATAL_ERROR)
+
+project(thapi-integration-test)
+
+if(GTENSOR_DEVICE STREQUAL "cuda")
+  enable_language(CUDA)
+  find_package(CUDAToolkit REQUIRED)
+endif()
+
+option(USE_THAPI_CTL "use thapi-ctl to start/stop tracing" ON)
+set(THAPI_PATH "/opt/thapi" CACHE STRING "Path to thapi installation")
+
+message(STATUS "${PROJECT_NAME}: THAPI_PATH=${THAPI_PATH}")
+
+# add dependencies
+include(cmake/CPM.cmake)
+CPMFindPackage(NAME gtensor
+               GITHUB_REPOSITORY wdmapp/gtensor
+               GIT_TAG "main"
+               OPTIONS "GTENSOR_ENABLE_BLAS ON"
+                       "GTENSOR_ENABLE_SOLVER ON")
+
+add_executable(axpy_start_stop)
+target_gtensor_sources(axpy_start_stop PRIVATE axpy_start_stop.cxx)
+target_link_libraries(axpy_start_stop gtensor::gtensor)
+
+if (${USE_THAPI_CTL})
+  find_package(PkgConfig REQUIRED)
+  pkg_check_modules(LTTNG_CTL REQUIRED lttng-ctl)
+
+  add_library(thapi_ctl INTERFACE IMPORTED)
+  target_include_directories(thapi_ctl INTERFACE "${THAPI_PATH}/include")
+  target_link_libraries(thapi_ctl INTERFACE "${THAPI_PATH}/lib/libthapi-ctl.so")
+  target_link_libraries(thapi_ctl INTERFACE ${LTTNG_CTL_LDFLAGS})
+  target_link_libraries(axpy_start_stop thapi_ctl)
+else()
+  target_compile_definitions(axpy_start_stop PRIVATE NO_THAPI_CTL)
+endif()
diff --git a/integration_tests/gtensor/README.md b/integration_tests/gtensor/README.md
new file mode 100644
index 00000000..c849612c
--- /dev/null
+++ b/integration_tests/gtensor/README.md
@@ -0,0 +1,13 @@
+# gtensor integration tests
+
+Allows testing cuda, hip, and ze backends with single source file.
+
+# Usage
+
+```
+GTENSOR_DEVICE=cuda bats ./integration_tests/gtensor.bats
+```
+Or "hip" or "sycl"
+
+Note that THAPI hip backend does not yet do device profiling, modifications will need to be
+made for tests to pass on hip.
diff --git a/integration_tests/gtensor/axpy_start_stop.cxx b/integration_tests/gtensor/axpy_start_stop.cxx
new file mode 100644
index 00000000..0eb60fa9
--- /dev/null
+++ b/integration_tests/gtensor/axpy_start_stop.cxx
@@ -0,0 +1,105 @@
+
+#include <iostream>
+
+#include <gtensor/gtensor.h>
+
+#ifndef NO_THAPI_CTL
+#include "thapi-ctl.h"
+#endif
+
+// provides convenient shortcuts for common gtensor functions, for example
+// underscore ('_') to represent open slice ends.
+using namespace gt::placeholders;
+
+/**
+ * Templated function for daxpy that can work with host gtensor or device
+ * gtensor. Relies on C++11 mandatory copy elision to move the result into the
+ * LHS it is assigned to, rather than copying all the data (very important for
+ * large arrays). Input arrays are passed by reference, again to avoid copying
+ * all the data which would happen by default (just like std::vector, gtensor
+ * has copy semantics by default).
+ */
+template <typename Ex, typename Ey>
+auto daxpy(double a, const Ex& x, const Ey& y)
+{
+  // The expression 'a * x + y' generates a gfunction template, which is an
+  // un-evaluated expression. In this case, we force evaluation in the return
+  // statementso a new gtensor is created. It can be useful to not add the
+  // `gt::eval`, in which case the un-evaluated expression will be returned and
+  // can be combined with other operations before being evaluated, or assigned
+  // to a gtensor to force immediate evaluation.
+  return gt::eval(a * x + y);
+}
+
+int main(int argc, char** argv)
+{
+  int n = 1024 * 1024;
+  int nprint = 32;
+
+  double a = 0.5;
+
+  // Define and allocate two 1d vectors of size n on the host. Declare
+  // but don't allocate a third 1d host vector for storing the result.
+  gt::gtensor<double, 1, gt::space::host> h_x(gt::shape(n));
+  gt::gtensor<double, 1, gt::space::host> h_y = gt::empty_like(h_x);
+  gt::gtensor<double, 1, gt::space::host> h_axpy;
+
+  // initialize the vectors, x is twice it's index values and y is equal
+  // to it's index values. We will perform .5*x + y, so the result should be
+  // axpy(i) = 2i.
+  for (int i = 0; i < n; i++) {
+    h_x(i) = 2.0 * static_cast<double>(i);
+    h_y(i) = static_cast<double>(i);
+  }
+
+#ifndef GTENSOR_HAVE_DEVICE
+#error "device required"
+#endif
+
+  // Define and allocate device versions of h_x and h_y, and declare
+  // a varaible for the result on gpu.
+  gt::gtensor<double, 1, gt::space::device> d_x(gt::shape(n));
+  gt::gtensor<double, 1, gt::space::device> d_y = gt::empty_like(d_x);
+  gt::gtensor<double, 1, gt::space::device> d_axpy;
+
+  // Explicit copies of input from host to device. Note that this is an
+  // overload of the copy function for gtensor and gtensor_span types, not
+  // std::copy which has a different signature. The source is the first
+  // argument and destination the second argument. Currently thrust::copy is
+  // used under the hood in the implementation.
+  gt::copy(h_x, d_x);
+  gt::copy(h_y, d_y);
+
+  // This automatically generates a computation kernel to run on the
+  // device.
+  auto expr = a * d_x + d_y;
+
+  for (int i = 0; i < 10; i++) {
+#ifndef NO_THAPI_CTL
+    if (i % 2 == 1) {
+      thapi_ctl_start();
+    }
+#endif
+
+    d_axpy = gt::eval(expr);
+    h_axpy = gt::empty_like(h_x);
+    gt::copy(d_axpy, h_axpy);
+    gt::synchronize();
+
+    std::cout << "axpy(10) = " << h_axpy(10) << std::endl;
+
+#ifndef NO_THAPI_CTL
+    if (i % 2 == 1) {
+      thapi_ctl_stop();
+    }
+#endif
+  }
+
+  // Define a slice to print a subset of elements for spot checking the
+  // result.
+  auto print_slice = gt::gslice(_, _, n / nprint);
+  std::cout << "a       = " << a << std::endl;
+  std::cout << "x       = " << h_x.view(print_slice) << std::endl;
+  std::cout << "y       = " << h_y.view(print_slice) << std::endl;
+  std::cout << "a*x + y = " << h_axpy.view(print_slice) << std::endl;
+}
diff --git a/integration_tests/gtensor/cmake/CPM.cmake b/integration_tests/gtensor/cmake/CPM.cmake
new file mode 100644
index 00000000..d0fd0e8e
--- /dev/null
+++ b/integration_tests/gtensor/cmake/CPM.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: MIT
+#
+# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
+
+set(CPM_DOWNLOAD_VERSION 0.39.0)
+set(CPM_HASH_SUM "66639bcac9dd2907b2918de466783554c1334446b9874e90d38e3778d404c2ef")
+
+if(CPM_SOURCE_CACHE)
+  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+elseif(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+else()
+  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+endif()
+
+# Expand relative path. This is important if the provided path contains a tilde (~)
+get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
+
+file(DOWNLOAD
+     https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+     ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
+)
+
+include(${CPM_DOWNLOAD_LOCATION})
diff --git a/integration_tests/libthapi-ctl.bats b/integration_tests/libthapi-ctl.bats
new file mode 100644
index 00000000..87f8ca3c
--- /dev/null
+++ b/integration_tests/libthapi-ctl.bats
@@ -0,0 +1,64 @@
+#!/usr/bin/env bats
+
+setup_file() {
+   export IPROF=$THAPI_INSTALL_DIR/bin/iprof
+
+   (cd integration_tests/libthapi-ctl;
+    #ld --verbose | grep SEARCH_DIR | tr -s ' ;' \\012;
+    #gcc -print-search-dirs | grep libraries | tr -s ' ;=:' \\012;
+    make clean && make -j;
+    #ldd hello-cl-startstop;
+    #objdump -p hello-cl-startstop;
+    )
+
+   export THAPI_TEST_BIN=$(pwd)/integration_tests/libthapi-ctl/hello-cl-startstop
+   export LD_LIBRARY_PATH=$THAPI_INSTALL_DIR/lib:$LD_LIBRARY_PATH
+
+   # Note: the pocl version in Ubuntu 20.04 (old lts as of 2024) does not
+   # support modern CPUs and will fail kernel compilation (notably does not work
+   # on CI). To keep things simple, we do clEnqueueWriteBuffer only if pocl is the
+   # default opencl platform.
+   IS_POCL=$(clinfo -l | grep 'Platform #0' | grep -c "Portable Computing Language" || true)
+
+   # echo "# IS_POCL=$IS_POCL" >&3
+
+   TEST_ARGS=""
+   if [ "$IS_POCL" = "0" ]; then
+     TEST_ARGS="0 0 1"
+   fi
+   export TEST_ARGS IS_POCL
+}
+
+teardown_file() {
+   rm -rf $THAPI_HOME/thapi-traces
+}
+
+@test "startstop-cl-trace-from-start" {
+   $IPROF --debug 0 $THAPI_TEST_BIN $TEST_ARGS
+   TRACE=$(ls -1t $THAPI_HOME/thapi-traces | head -n1)
+   write_count=$($IPROF -r -j | jq '.device.data.clEnqueueWriteBuffer.call')
+   # 0th call, plus 5 odd calls
+   [ $write_count -eq 6 ]
+
+   # echo "# IS_POCL=$IS_POCL" >&3
+
+   if [ "$IS_POCL" = "0" ]; then
+     kernel_count=$($IPROF -r -j | jq '.device.data.hello_world.call')
+     [ $write_count -eq 6 ]
+   fi
+}
+
+@test "startstop-cl-no-trace-from-start" {
+   $IPROF --no-trace-from-start --debug 0 $THAPI_TEST_BIN $TEST_ARGS
+   write_count=$($IPROF -r -j | jq '.device.data.clEnqueueWriteBuffer.call')
+   # 5 odd calls, does not include 0 since profiling should be disabled at start
+   [ $write_count -eq 5 ]
+
+   # echo "# IS_POCL=$IS_POCL" >&3
+
+   if [ "$IS_POCL" = "0" ]; then
+     kernel_count=$($IPROF -r -j | jq '.device.data.hello_world.call')
+     [ $write_count -eq 5 ]
+     # echo "# IS_POCL=$IS_POCL write_count=$write_count" >&3
+   fi
+}
diff --git a/integration_tests/libthapi-ctl/Makefile b/integration_tests/libthapi-ctl/Makefile
new file mode 100644
index 00000000..0b1e437c
--- /dev/null
+++ b/integration_tests/libthapi-ctl/Makefile
@@ -0,0 +1,13 @@
+CFLAGS = -g -Wall -Wextra $(WERROR) -DCL_TARGET_OPENCL_VERSION=220 -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -I$(THAPI_INSTALL_DIR)/include
+LDFLAGS = /usr/lib/x86_64-linux-gnu/libOpenCL.so -L$(THAPI_INSTALL_DIR)/lib -lthapi-ctl -llttng-ctl
+
+
+.PHONY: all
+all: hello-cl-startstop
+
+hello-cl-startstop: hello-cl-startstop.c cl_utils.c
+	gcc $(CFLAGS) $^ $(LDFLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f hello-cl-startstop
diff --git a/integration_tests/libthapi-ctl/cl_utils.c b/integration_tests/libthapi-ctl/cl_utils.c
new file mode 100644
index 00000000..548392b3
--- /dev/null
+++ b/integration_tests/libthapi-ctl/cl_utils.c
@@ -0,0 +1,163 @@
+#include <stdio.h>
+#include <CL/opencl.h>
+#include "./cl_utils.h"
+
+//  _
+// |_)  _  |  _  ._ ._  |  _. _|_  _
+// |_) (_) | (/_ |  |_) | (_|  |_ (/_
+//                  |
+
+/* - - - -
+IO
+- - - - */
+int read_from_binary(unsigned char **output, size_t *size, const char *name) {
+  FILE *fp = fopen(name, "rb");
+  if (!fp) {
+    return -1;
+  }
+
+  fseek(fp, 0, SEEK_END);
+  *size = ftell(fp);
+  fseek(fp, 0, SEEK_SET);
+
+  *output = (unsigned char *)malloc(*size * sizeof(unsigned char));
+  if (!*output) {
+    fclose(fp);
+    return -1;
+  }
+
+  size_t ret = fread(*output, *size, 1, fp);
+  fclose(fp);
+  if (ret != *size)
+    return -1;
+
+  return 0;
+}
+
+int write_to_binary(unsigned char *input, size_t size, const char *name) {
+  FILE *fp = fopen(name, "wb");
+  if (!fp) {
+    return -1;
+  }
+  fwrite(input, 1, size, fp);
+  fclose(fp);
+  return 0;
+}
+
+char *read_from_file(const char *filename)
+{
+    size_t size = 0;
+    FILE *file = fopen(filename, "r");
+
+    if(!file) {
+        fputs("File error.\n", stderr);
+        return NULL;
+    }
+
+    fseek(file, 0, SEEK_END);
+    size = ftell(file);
+    rewind(file);
+
+    char *result = (char *) malloc(size + 1);
+    result[size] = '\0';
+    if(!result) {
+        fputs("Memory error.\n", stderr);
+        return NULL;
+    }
+
+    if(fread(result, 1, size, file) != size) {
+        fputs("Read error.\n", stderr);
+        return NULL;
+    }
+
+    fclose(file);
+    return result;
+}
+/* - - - -
+OpenCL Error
+- - - - */
+
+const char *getErrorString(cl_int error)
+{
+switch(error){
+    // run-time and JIT compiler errors
+    case 0: return "CL_SUCCESS";
+    case -1: return "CL_DEVICE_NOT_FOUND";
+    case -2: return "CL_DEVICE_NOT_AVAILABLE";
+    case -3: return "CL_COMPILER_NOT_AVAILABLE";
+    case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+    case -5: return "CL_OUT_OF_RESOURCES";
+    case -6: return "CL_OUT_OF_HOST_MEMORY";
+    case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
+    case -8: return "CL_MEM_COPY_OVERLAP";
+    case -9: return "CL_IMAGE_FORMAT_MISMATCH";
+    case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+    case -11: return "CL_BUILD_PROGRAM_FAILURE";
+    case -12: return "CL_MAP_FAILURE";
+    case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+    case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+    case -15: return "CL_COMPILE_PROGRAM_FAILURE";
+    case -16: return "CL_LINKER_NOT_AVAILABLE";
+    case -17: return "CL_LINK_PROGRAM_FAILURE";
+    case -18: return "CL_DEVICE_PARTITION_FAILED";
+    case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+
+    // compile-time errors
+    case -30: return "CL_INVALID_VALUE";
+    case -31: return "CL_INVALID_DEVICE_TYPE";
+    case -32: return "CL_INVALID_PLATFORM";
+    case -33: return "CL_INVALID_DEVICE";
+    case -34: return "CL_INVALID_CONTEXT";
+    case -35: return "CL_INVALID_QUEUE_PROPERTIES";
+    case -36: return "CL_INVALID_COMMAND_QUEUE";
+    case -37: return "CL_INVALID_HOST_PTR";
+    case -38: return "CL_INVALID_MEM_OBJECT";
+    case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+    case -40: return "CL_INVALID_IMAGE_SIZE";
+    case -41: return "CL_INVALID_SAMPLER";
+    case -42: return "CL_INVALID_BINARY";
+    case -43: return "CL_INVALID_BUILD_OPTIONS";
+    case -44: return "CL_INVALID_PROGRAM";
+    case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
+    case -46: return "CL_INVALID_KERNEL_NAME";
+    case -47: return "CL_INVALID_KERNEL_DEFINITION";
+    case -48: return "CL_INVALID_KERNEL";
+    case -49: return "CL_INVALID_ARG_INDEX";
+    case -50: return "CL_INVALID_ARG_VALUE";
+    case -51: return "CL_INVALID_ARG_SIZE";
+    case -52: return "CL_INVALID_KERNEL_ARGS";
+    case -53: return "CL_INVALID_WORK_DIMENSION";
+    case -54: return "CL_INVALID_WORK_GROUP_SIZE";
+    case -55: return "CL_INVALID_WORK_ITEM_SIZE";
+    case -56: return "CL_INVALID_GLOBAL_OFFSET";
+    case -57: return "CL_INVALID_EVENT_WAIT_LIST";
+    case -58: return "CL_INVALID_EVENT";
+    case -59: return "CL_INVALID_OPERATION";
+    case -60: return "CL_INVALID_GL_OBJECT";
+    case -61: return "CL_INVALID_BUFFER_SIZE";
+    case -62: return "CL_INVALID_MIP_LEVEL";
+    case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
+    case -64: return "CL_INVALID_PROPERTY";
+    case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
+    case -66: return "CL_INVALID_COMPILER_OPTIONS";
+    case -67: return "CL_INVALID_LINKER_OPTIONS";
+    case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
+    case -69:  return "CL_INVALID_PIPE_SIZE";
+    case -70: return "CL_INVALID_DEVICE_QUEUE";
+
+    default: return "Unknown OpenCL error";
+ }
+}
+
+void check_error(cl_int error, char const *name) {
+    if (error != CL_SUCCESS) {
+        fprintf(stderr, "Non-successful return code %d (%s) for %s.  Exiting.\n", error, getErrorString(error), name);
+        exit(EXIT_FAILURE);
+    }
+}
+
+void exit_msg(char const *str){
+        fprintf(stderr, "%s \n", str);
+        exit(EXIT_FAILURE);
+}
+
diff --git a/integration_tests/libthapi-ctl/cl_utils.h b/integration_tests/libthapi-ctl/cl_utils.h
new file mode 100644
index 00000000..cac450fc
--- /dev/null
+++ b/integration_tests/libthapi-ctl/cl_utils.h
@@ -0,0 +1,10 @@
+extern int read_from_binary(unsigned char **output, size_t *size, const char *name);
+extern int write_to_binary(unsigned char *input, size_t size, const char *name);
+
+extern char *read_from_file(const char *filename);
+
+extern const char *getErrorString(cl_int error);
+
+extern void check_error(cl_int error, char const *name);
+
+extern void exit_msg(char const *str);
diff --git a/integration_tests/libthapi-ctl/hello-cl-startstop.c b/integration_tests/libthapi-ctl/hello-cl-startstop.c
new file mode 100644
index 00000000..2ad9ad1a
--- /dev/null
+++ b/integration_tests/libthapi-ctl/hello-cl-startstop.c
@@ -0,0 +1,218 @@
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <CL/opencl.h>
+#include "./cl_utils.h"
+
+#include "thapi-ctl.h"
+
+#define NLOOP 10
+
+int main(int argc, char* argv[]) {
+
+   cl_int err;
+
+   //  _              _                      _
+   // |_) |  _. _|_ _|_ _  ._ ._ _    ()    | \  _     o  _  _
+   // |   | (_|  |_  | (_) |  | | |   (_X   |_/ (/_ \/ | (_ (/_
+   //
+    printf(">>> Initializing OpenCL Platform and Device...\n");
+
+    cl_uint platform_idx = 0;
+    cl_uint device_idx = 0;
+    cl_bool run_kernel = CL_FALSE;
+
+    if (argc > 1)
+      platform_idx = (cl_uint) atoi(argv[1]);
+    if (argc > 2)
+      device_idx = (cl_uint) atoi(argv[2]);
+    if (argc > 3)
+      run_kernel = CL_TRUE;
+
+    char name[128];
+    /* - - -
+    Plateform
+    - - - - */
+    //A platform is a specific OpenCL implementation, for instance AMD, NVIDIA or Intel.
+    // Intel may have a different OpenCL implementation for the CPU and GPU.
+
+    // Discover the number of platforms:
+    cl_uint platform_count;
+    err = clGetPlatformIDs(0, NULL, &platform_count);
+    check_error(err, "clGetPlatformIds");
+
+    // Now ask OpenCL for the platform IDs:
+    cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * platform_count);
+    err = clGetPlatformIDs(platform_count, platforms, NULL);
+    check_error(err, "clGetPlatformIds");
+
+    cl_platform_id platform = platforms[platform_idx];
+    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 128, name, NULL);
+    check_error(err, "clGetPlatformInfo");
+
+    printf("Platform #%d: %s\n", platform_idx, name);
+
+    /* - - - -
+    Device
+    - - - - */
+    // Device gather data
+    cl_uint device_count;
+    err = clGetDeviceIDs(platform,  CL_DEVICE_TYPE_ALL , 0, NULL, &device_count);
+    check_error(err, "clGetdeviceIds");
+
+    cl_device_id* devices = (cl_device_id*)malloc(sizeof(cl_device_id) * device_count);
+    err = clGetDeviceIDs(platform,  CL_DEVICE_TYPE_ALL , device_count, devices, NULL);
+    check_error(err, "clGetdeviceIds");
+
+    cl_device_id device = devices[device_idx];
+    err = clGetDeviceInfo(device, CL_DEVICE_NAME, 128, name, NULL);
+    check_error(err, "clGetPlatformInfo");
+
+    printf("-- Device #%d: %s\n", device_idx, name);
+
+    /* - - - -
+    Context
+    - - - - */
+    // A context is a platform with a set of available devices for that platform.
+    cl_context context = clCreateContext(0, device_count, devices, NULL, NULL, &err);
+    check_error(err,"clCreateContext");
+
+    const char *kernelstring =
+        "__kernel void hello_world() {"
+        "   const int world_rank = get_global_id(0);" 
+        "   printf(\"Hello world from rank %d \\n\", world_rank);" 
+        "}";
+
+    /* - - - -
+    Command queue
+    - - - - */
+    // The OpenCL functions that are submitted to a command-queue are enqueued in the order the calls are made but can be configured to execute in-order or out-of-order.
+    cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+    check_error(err,"clCreateCommandQueue");
+
+    // create a device buffer
+    size_t a_size = 100;
+    size_t batch_size = a_size / NLOOP;
+    size_t offset = 0;
+    size_t a_bytes = a_size * sizeof(float);
+    float *h_a = (float*)malloc(a_bytes);
+    for (unsigned int i = 0; i < a_size; i++) {
+        h_a[i] = (float)i;
+    }
+    cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_WRITE, a_bytes, NULL, &err);
+    check_error(err,"cclCreateBuffer");
+
+    // define program and kernel variables, used if run_kernel was requested in
+    // 3rd arg
+    cl_program program;
+    cl_kernel kernel;
+#define WORK_DIM 1
+
+    // Describe the number of global work-items in work_dim dimensions that will execute the kernel function
+    const size_t global[WORK_DIM] = { 64 };
+
+    // Describe the number of work-items that make up a work-group (also referred to as the size of the work-group).
+    // local_work_size can also be a NULL value in which case the OpenCL implementation will determine how to be break the global work-items into appropriate work-group instances.
+    const size_t local[WORK_DIM] = { 32 };
+
+    if (run_kernel) {
+      // Create the program
+      program = clCreateProgramWithSource(context, 1, &kernelstring, NULL, &err);
+      check_error(err,"clCreateProgramWithSource");
+
+      //Build / Compile the program executable
+      err = clBuildProgram(program, device_count, devices, "-cl-unsafe-math-optimizations", NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+          printf("Error: Failed to build program executable!\n");
+
+          size_t logSize;
+          clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+
+          // there's no information in the reference whether the string is 0 terminated or not.
+          char* messages = (char*)malloc((1+logSize)*sizeof(char));
+          clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
+          messages[logSize] = '\0';
+
+          printf("%s", messages);
+          free(messages);
+          return EXIT_FAILURE;
+      }
+
+     /* - - - -
+      Create
+      - - - - */
+      kernel = clCreateKernel(program, "hello_world", &err);
+      check_error(err,"clCreateKernel");
+
+      /* - - - -
+      ND range
+      - - - - */
+      printf(">>> NDrange configuration...\n");
+
+      printf("Global work size: %zu \n", global[0]);
+      printf("Local work size: %zu \n", local[0]);
+
+    }
+
+    /* - - - -
+    Execute
+    - - - - */
+    for (unsigned int i = 0; i < NLOOP; i++) {
+      printf(">>> WriteBuffer Execution [%d]...\n", i);
+
+      // trace only odd executions
+      if (i % 2 == 1)
+         thapi_ctl_start();
+
+    // (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
+      err  = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, offset * sizeof(float), batch_size * sizeof(float), h_a, 0, NULL, NULL);
+      check_error(err,"clEnqueueWriteBuffer");
+
+      if (run_kernel) {
+        printf(">>> Kernel Execution [%d]...\n", i);
+        err  = clEnqueueNDRangeKernel(queue, kernel, WORK_DIM, NULL, global, local, 0, NULL, NULL);
+        check_error(err,"clEnqueueNDRangeKernel");
+      }
+
+      /* - - -
+      Sync & check
+      - - - */
+
+       // Wait for the command queue to get serviced before reading back results
+      clFinish(queue);
+
+
+      /* - - -
+      Sync & check
+      - - - */
+
+       // Wait for the command queue to get serviced before reading back results
+      clFinish(queue);
+
+      if (i % 2 == 1)
+         thapi_ctl_stop();
+
+      offset += batch_size;
+    }
+
+    //
+    // /  |  _   _. ._  o ._   _  
+    // \_ | (/_ (_| | | | | | (_| 
+    //                         _|
+    free(devices);
+    free(platforms);
+    clReleaseCommandQueue(queue);
+    clReleaseMemObject(d_a);
+    clReleaseContext(context);
+
+    if (run_kernel) {
+      clReleaseProgram(program);
+      clReleaseKernel(kernel);
+    }
+
+    // Exit
+    return 0;
+}
+
+// =================================================================================================
diff --git a/integration_tests/libthapi-ctl/omp_target/Makefile b/integration_tests/libthapi-ctl/omp_target/Makefile
new file mode 100644
index 00000000..d35e134b
--- /dev/null
+++ b/integration_tests/libthapi-ctl/omp_target/Makefile
@@ -0,0 +1,15 @@
+CXXFLAGS = -g -Wall -Wextra $(WERROR) -I$(THAPI_INSTALL_DIR)/include
+LDFLAGS = -L$(THAPI_INSTALL_DIR)/lib -lthapi-ctl -L$(LTTNG_INSTALL_DIR)/lib -llttng-ctl
+
+vec_cuda: vec.cpp
+	clang++ $(CXXFLAGS) $(LDFLAGS) -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda $^ -o $@
+
+vec_intel: vec.cpp
+	icpx $(CXXFLAGS) $(LDFLAGS) -qopenmp -fopenmp-targets=spir64 $^ -o $@
+
+vec_hip: vec.cpp
+	amdclang++ $(CXXFLAGS) $(LDFLAGS) -fopenmp --offload-arch=gfx1030 $^ -o $@
+
+.PHONY: clean
+clean:
+	rm -f vec_cuda vec_intel vec_hip
diff --git a/integration_tests/libthapi-ctl/omp_target/vec.cpp b/integration_tests/libthapi-ctl/omp_target/vec.cpp
new file mode 100644
index 00000000..d9e6841d
--- /dev/null
+++ b/integration_tests/libthapi-ctl/omp_target/vec.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) 2019 CSC Training */
+/* Copyright (c) 2021 ENCCS */
+#include <stdio.h>
+#include <math.h>
+
+#include "thapi-ctl.h"
+
+#define NX 102400
+
+#define NITER 10
+
+int main(void)
+{
+  double vecA[NX],vecB[NX],vecC[NX];
+  double r=0.2;
+
+/* Initialization of vectors */
+  for (int i = 0; i < NX; i++) {
+     vecA[i] = pow(r, i);
+     vecB[i] = 1.0;
+  }
+
+  for (int n = 0; n < NITER; n++) {
+    if (n % 2 == 1)
+      thapi_ctl_start();
+
+    /* dot product of two vectors */
+    #pragma omp target teams distribute parallel for
+    for (int i = 0; i < NX; i++) {
+       vecC[i] = vecA[i] * vecB[i];
+    }
+
+    if (n % 2 == 1)
+      thapi_ctl_stop();
+  }
+
+  double sum = 0.0;
+  /* calculate the sum */
+  for (int i = 0; i < NX; i++) {
+    sum += vecC[i];
+  }
+  printf("The sum is: %8.6f \n", sum);
+  return 0;
+}
diff --git a/libthapictl/Makefile.am b/libthapictl/Makefile.am
new file mode 100644
index 00000000..b7cb93f4
--- /dev/null
+++ b/libthapictl/Makefile.am
@@ -0,0 +1,29 @@
+if STRICT
+  WERROR = -Werror
+else
+  WERROR =
+endif
+
+include_HEADERS = \
+	include/thapi-ctl.h \
+	include/backends.h
+
+lib_LTLIBRARIES = libthapi-ctl.la
+
+libthapi_ctl_la_SOURCES = src/startstop.c src/backends.c
+libthapi_ctl_la_CFLAGS = -Wall -Wextra $(WERROR) -I$(srcdir)/include -I$(builddir) -I$(top_srcdir)/utils/include $(LTTNG_CTL_CFLAGS)
+libthapi_ctl_la_LDFLAGS = $(LTTNG_CTL_LIBS)
+
+bin_PROGRAMS = thapi-ctl
+
+thapi_ctl_SOURCES = src/main.c
+thapi_ctl_CFLAGS = -Wall -Wextra $(WERROR) -I$(srcdir)/include -I$(builddir) -I$(top_srcdir)/utils/include $(LTTNG_CTL_CFLAGS)
+thapi_ctl_LDFLAGS = -L$(builddir) -lthapi-ctl $(LTTNG_CTL_LIBS)
+thapi_ctl_DEPENDENCIES = libthapi-ctl.la
+
+check_PROGRAMS = test-thapi-ctl
+test_thapi_ctl_SOURCES = tests/test-thapi-ctl.c
+test_thapi_ctl_CFLAGS = -Wall -Wextra $(WERROR) -I$(srcdir)/include $(LTTNG_CTL_CFLAGS)
+test_thapi_ctl_LDFLAGS = -L$(builddir) -Wl,-rpath,$(builddir) -lthapi-ctl $(LTTNG_CTL_LIBS)
+
+TESTS = $(check_PROGRAMS)
diff --git a/libthapictl/README.md b/libthapictl/README.md
new file mode 100644
index 00000000..967f25ae
--- /dev/null
+++ b/libthapictl/README.md
@@ -0,0 +1,75 @@
+# libthapictl
+
+This library is desiged to allow applications to control event collection, similar to
+how cudaProfilerStart and cudaProfilerStop work.
+
+## Design Considerations
+
+The simplest design would be to stop and start the entire lttng session. However many
+backends use complex chains of API calls to figure out the device context of things like
+memory operations and kernel launches on device. Collection of events for these API
+calls must never be disabled or thapi will be unable to correctly identify future
+device operations. We call these API calls "bookkeeping" calls.
+
+In addition to "bookkeeping" events, there are additional categories for backends that
+support "minimal", "default", and/or "full" tracing modes, and device profiling can also
+be enabled or disabled independently. This creates further categories.
+
+### Design 1: two sessions
+
+The initial idea was to create two lttng sessions, one that would collect all the
+"bookkeeping" events and one that would collect everything else (the "main" session). Then
+thapi_ctl_start/stop could simply start/stop the main session, and always leave the
+bookkeeping session running. This worked well enough for CUDA, except when the
+"--no-tracing-from-start" option was used. This option causes start to not be run
+unless the user calls it, so during cuInit, only the "bookkeeping" session is active.
+
+The CUDA runtime in particular has special handling for dynamic symbols, using the
+cuGetProcAddress_v2 API call as the hook to wrap dynamically loaded symbols. This works
+fine if both bookkeeping and main sessions are running when cuGetProcAddress loads all
+the symbols (during cuInit, triggered by first CUDA runtime call), and both are configured
+to intercept the entry/exit events for that call. However if main is not started, then
+it never gets to wrap the dynamic symbols, and cuGetProcAddress is not called again.
+
+OpenCL does not have this complication, and works well with this method. ZE may be similarly complex to CUDA, and HIP may also be once we add device profiling.
+
+We could not support "--no-tracing-from-start", and make it the user's responsibility to
+call thapi_ctl_stop in the beginning of the program but AFTER cuInit and similar, but this
+is fragile and hard to document. Another possiblility would be to try to force cuInit
+right after tracer init time, but this is also a very invasive thing for a tracing
+framework to do. For this reason, we came up with another design:
+
+### Design 2: fine grained event enable/disable
+
+Rather than put "bookkeeping" in a different category, we can have libthapictl know which
+events are bookkeeping and which events to enable/disable for start stop. To avoid
+duplicating this knowledge in xprof ruby code AND in libthapictl, all this knowledge
+can be moved into libthapictl, and a "thapi-ctl" binary that is a thin wrapper around
+the library start/stop can be called from xprof.
+
+There are three commands:
+
+init: enable bookkeeping events (always called)
+start: enable tracing events, according to options requested by user (tracing mode, device profiling on/off, which backends are enabled)
+stop: disable tracing events
+
+By default tracing from start is enabled, so both init and start are called for each enabled
+backend. Then the profiled application can link libthapi-ctl and call thapi_ctl_stop/start
+as desired.
+
+This approach is more complex than (1), but it is far more likely to work for all backends
+without gotchas for the user like call stop before init breaking everything.
+
+## Environment variables
+
+Things like the LTTNG_HOME, session name, and channel name, enabled backends, and tracing
+mode are passed from xprof to thapi-ctl and libthapictl via environment variables, which
+are memoized with static C functions / static function vars.
+
+## Testing
+
+This is a very invasive change to THAPI; in addition to testing the new functionality of
+starting/stopping tracing, we need to test that existing functionality like tracing modes
+is not broken. To this end new integration tests are being added, that can't be run on
+CI because of hardware requirements but should be easy to run manually on machines with
+appropriate hardware. See libthapictl and gtensor integration test directories.
diff --git a/libthapictl/include/backends.h b/libthapictl/include/backends.h
new file mode 100644
index 00000000..2a69e109
--- /dev/null
+++ b/libthapictl/include/backends.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <lttng/lttng.h>
+
+void thapi_cuda_init(struct lttng_handle *h, const char *channel_name);
+void thapi_cuda_enable_tracing_events(struct lttng_handle *h, const char *channel_name);
+void thapi_cuda_disable_tracing_events(struct lttng_handle *h, const char *channel_name);
+
+void thapi_opencl_init(struct lttng_handle *h, const char *channel_name);
+void thapi_opencl_enable_tracing_events(struct lttng_handle *h, const char *channel_name);
+void thapi_opencl_disable_tracing_events(struct lttng_handle *h, const char *channel_name);
+
+void thapi_omp_init(struct lttng_handle *h, const char *channel_name);
+void thapi_omp_enable_tracing_events(struct lttng_handle *h, const char *channel_name);
+void thapi_omp_disable_tracing_events(struct lttng_handle *h, const char *channel_name);
+
+void thapi_hip_init(struct lttng_handle *h, const char *channel_name);
+void thapi_hip_enable_tracing_events(struct lttng_handle *h, const char *channel_name);
+void thapi_hip_disable_tracing_events(struct lttng_handle *h, const char *channel_name);
diff --git a/libthapictl/include/thapi-ctl.h b/libthapictl/include/thapi-ctl.h
new file mode 100644
index 00000000..69999cef
--- /dev/null
+++ b/libthapictl/include/thapi-ctl.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#define THAPI_CTL_LOG_LEVEL_ERROR 1
+#define THAPI_CTL_LOG_LEVEL_INFO 2
+#define THAPI_CTL_LOG_LEVEL_WARN 3
+#define THAPI_CTL_LOG_LEVEL_DEBUG 4
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Initialize bookkeeping events for enabled backends, if run with iprof
+ *
+ * Does nothing if not run with iprof. This is detected using the
+ * THAPI_LTTNG_SESSION_ID environment variable.
+ *
+ * @param enable_tracing: if false, enable only bookkeeping events, otherwise
+ *  enable all events
+ *
+ * @return 0 on success
+ */
+int thapi_ctl_init();
+
+
+/**
+ * Start tracing host and device events, when application is run with iprof.
+ *
+ * Does nothing if not run with iprof. This is detected using the
+ * THAPI_LTTNG_SESSION_ID environment variable.
+ *
+ * @return 0 on success
+ */
+int thapi_ctl_start();
+
+/**
+ * Stop tracing host and device events, when application is run with iprof.
+ *
+ * Does nothing if not run with iprof. This is detected using the
+ * THAPI_LTTNG_SESSION_ID environment variable.
+ *
+ * @return 0 on success
+ */
+int thapi_ctl_stop();
+
+
+/**
+ * Destroy any lttng handles associated with thapi-ctl.
+ *
+ * @return 0 on success
+ */
+void thapi_ctl_destroy();
+
+
+/**
+ * Log to stderr according to configured log level.
+ *
+ * Log level is set with THAPI_CTL_LOG_LEVEL env variable.
+ *
+ * @param log_level: one of THAPI_CTL_LOG_LEVEL_(DEBUG|INFO|WARN|ERROR)
+ * @param fmt: printf style format
+ * @param ...: printf format arguments
+ */
+void thapi_ctl_log(int log_level, const char *fmt, ...);
+
+
+void thapi_ctl_print_events();
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/libthapictl/src/backends.c b/libthapictl/src/backends.c
new file mode 100644
index 00000000..1947fbfe
--- /dev/null
+++ b/libthapictl/src/backends.c
@@ -0,0 +1,396 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <lttng/lttng.h>
+
+#include "thapi.h"
+#include "thapi-ctl.h"
+
+
+enum tracing_mode_e {
+  THAPI_CTL_TRACING_MODE_UNSET=0,
+  THAPI_CTL_TRACING_MODE_MINIMAL=1,
+  THAPI_CTL_TRACING_MODE_DEFAULT=2,
+  THAPI_CTL_TRACING_MODE_FULL=3,
+};
+typedef enum tracing_mode_e tracing_mode_t;
+
+
+#define ARRAY_LENGTH(a) (sizeof(a) / sizeof(*a));
+
+
+/*
+ *  "lttng_ust_cuda:cuDriverGetVersion_entry",
+  "lttng_ust_cuda:cuDriverGetVersion_exit",
+  "lttng_ust_cuda:cuDriverGetAttribute_entry",
+  "lttng_ust_cuda:cuDriverGetAttribute_exit",
+  "lttng_ust_cuda:cuInit_entry",
+  "lttng_ust_cuda:cuInit_exit",
+  "lttng_ust_cuda:cuGetExportTable_entry",
+  "lttng_ust_cuda:cuGetExportTable_exit",
+  "lttng_ust_cuda:cuCtxGetCurrent_entry",
+  "lttng_ust_cuda:cuCtxGetCurrent_exit",
+  */
+
+static char* cuda_bookkeeping_events[] = {
+  "lttng_ust_cuda:cuGetProcAddress_v2_entry",
+  "lttng_ust_cuda:cuGetProcAddress_v2_exit",
+  "lttng_ust_cuda:cuCtxCreate_entry",
+  "lttng_ust_cuda:cuCtxCreate_exit",
+  "lttng_ust_cuda:cuCtxCreate_v2_entry",
+  "lttng_ust_cuda:cuCtxCreate_v2_exit",
+  "lttng_ust_cuda:cuCtxCreate_v3_entry",
+  "lttng_ust_cuda:cuCtxCreate_v3_exit",
+  "lttng_ust_cuda:cuCtxDestroy_entry",
+  "lttng_ust_cuda:cuCtxDestroy_exit",
+  "lttng_ust_cuda:cuCtxDestroy_v2_entry",
+  "lttng_ust_cuda:cuCtxDestroy_v2_exit",
+  "lttng_ust_cuda:cuCtxSetCurrent_entry",
+  "lttng_ust_cuda:cuCtxSetCurrent_exit",
+  "lttng_ust_cuda:cuCtxPushCurrent_entry",
+  "lttng_ust_cuda:cuCtxPushCurrent_exit",
+  "lttng_ust_cuda:cuCtxPushCurrent_v2_entry",
+  "lttng_ust_cuda:cuCtxPushCurrent_v2_exit",
+  "lttng_ust_cuda:cuCtxPopCurrent_entry",
+  "lttng_ust_cuda:cuCtxPopCurrent_exit",
+  "lttng_ust_cuda:cuCtxPopCurrent_v2_entry",
+  "lttng_ust_cuda:cuCtxPopCurrent_v2_exit",
+  "lttng_ust_cuda:cuDevicePrimaryCtxRetain_entry",
+  "lttng_ust_cuda:cuDevicePrimaryCtxRetain_exit",
+  "lttng_ust_cuda:cuStreamCreate_entry",
+  "lttng_ust_cuda:cuStreamCreate_exit",
+  "lttng_ust_cuda:cuStreamCreateWithPriority_entry",
+  "lttng_ust_cuda:cuStreamCreateWithPriority_exit",
+  "lttng_ust_cuda:cuModuleGetFunction_entry",
+  "lttng_ust_cuda:cuModuleGetFunction_exit",
+  "lttng_ust_cuda_profiling:event_profiling_results",
+};
+
+
+// TODO: encapsulate all env vars into a single struct
+static tracing_mode_t thapi_ctl_tracing_mode() {
+  static tracing_mode_t mode = THAPI_CTL_TRACING_MODE_UNSET;
+  if (mode == THAPI_CTL_TRACING_MODE_UNSET) {
+    const char *mode_env = getenv("LTTNG_UST_TRACING_MODE");
+    if (mode_env == NULL) {
+      mode = THAPI_CTL_TRACING_MODE_DEFAULT;
+    } else if (strcmp(mode_env, "minimal") == 0) {
+      mode = THAPI_CTL_TRACING_MODE_MINIMAL;
+    } else if (strcmp(mode_env, "full") == 0) {
+      mode = THAPI_CTL_TRACING_MODE_FULL;
+    } else if (strcmp(mode_env, "default") == 0) {
+      mode = THAPI_CTL_TRACING_MODE_DEFAULT;
+    } else {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_WARN,
+                    "Unknown tracing mode: %s, using default", mode_env);
+      mode = THAPI_CTL_TRACING_MODE_DEFAULT;
+    }
+  }
+  return mode;
+}
+
+
+static int thapi_ctl_profiling() {
+  static int profiling = -1;
+  if (profiling == -1) {
+    const char *profiling_env = getenv("LTTNG_UST_PROFILING");
+    if (profiling_env == NULL || strcmp(profiling_env, "1") == 0) {
+      profiling = 1;
+    } else {
+      profiling = 0;
+    }
+  }
+  return profiling;
+}
+
+
+static char* cuda_tracing_events[] = {
+  "lttng_ust_cuda:*",
+  "lttng_ust_cuda_properties",
+  "lttng_ust_cuda_profiling:event_profiling",
+};
+
+
+static inline int is_wildcard(const char *event_pattern) {
+  return (strchr(event_pattern, '*') != NULL);
+}
+
+
+static void thapi_enable_events_by_pattern(struct lttng_handle* handle,
+                                           struct lttng_event* ev,
+                                           int n_patterns,
+                                           char** event_patterns,
+                                           const char* channel_name,
+                                           int exclusion_count,
+                                           char **exclusion_list) {
+  memset(ev, 0, sizeof(*ev));
+  ev->type = LTTNG_EVENT_TRACEPOINT;
+
+  for (int i = 0; i < n_patterns; i++) {
+    const char *pattern = event_patterns[i];
+    // thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "Enabling event '%s' on channel '%s'",
+    //               pattern, channel_name);
+    strncpy(ev->name, pattern, LTTNG_SYMBOL_NAME_LEN);
+    ev->name[LTTNG_SYMBOL_NAME_LEN - 1] = '\0';
+    int rval = 0;
+    if (exclusion_count > 0 && is_wildcard(pattern)) {
+      rval = lttng_enable_event_with_exclusions(handle, ev, channel_name, NULL,
+                                                exclusion_count, exclusion_list);
+    } else {
+      rval = lttng_enable_event_with_exclusions(handle, ev, channel_name, NULL,
+                                                0, NULL);
+    }
+
+    if (rval < 0 && rval != -LTTNG_ERR_UST_EVENT_ENABLED) {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                    "Failed to enable event '%s': %d",
+                    pattern, rval);
+    }
+  }
+}
+
+
+static void thapi_disable_events_by_pattern(struct lttng_handle* handle,
+                                            struct lttng_event* ev,
+                                            int n_patterns,
+                                            char** event_patterns,
+                                            const char* channel_name) {
+  memset(ev, 0, sizeof(*ev));
+  ev->type = LTTNG_EVENT_ALL; // match any event type
+  ev->loglevel = -1; // match any log level
+
+  for (int i = 0; i < n_patterns; i++) {
+    const char *pattern = event_patterns[i];
+    // thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "Disabling event '%s' on channel '%s'",
+    //               pattern, channel_name);
+    strncpy(ev->name, pattern, LTTNG_SYMBOL_NAME_LEN);
+    ev->name[LTTNG_SYMBOL_NAME_LEN - 1] = '\0';
+    int rval = lttng_disable_event_ext(handle, ev, channel_name, NULL);
+    if (rval < 0) {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                    "Failed to disable event '%s': %d",
+                    pattern, rval);
+    }
+  }
+}
+
+
+void thapi_cuda_init(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+
+  int n_bookkeeping_events = ARRAY_LENGTH(cuda_bookkeeping_events);
+  thapi_enable_events_by_pattern(h, ev, n_bookkeeping_events, cuda_bookkeeping_events,
+                                 channel_name, 0, NULL);
+
+  lttng_event_destroy(ev);
+}
+
+
+void thapi_cuda_enable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+
+  int n_bookkeeping_events = ARRAY_LENGTH(cuda_bookkeeping_events);
+  int n_tracing_events = ARRAY_LENGTH(cuda_tracing_events);
+  thapi_enable_events_by_pattern(h, ev, n_tracing_events, cuda_tracing_events, channel_name,
+                                 n_bookkeeping_events, cuda_bookkeeping_events);
+
+  lttng_event_destroy(ev);
+}
+
+
+void thapi_cuda_disable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+
+  int n_tracing_events = sizeof(cuda_tracing_events)
+                         / sizeof(*cuda_tracing_events);
+  thapi_disable_events_by_pattern(h, ev, n_tracing_events,
+                                  cuda_tracing_events, channel_name);
+
+  lttng_event_destroy(ev);
+}
+
+
+static char* opencl_bookkeeping_events[] = {
+  "lttng_ust_opencl:clGetDeviceIDs",
+  "lttng_ust_opencl:clCreatSubDevices",
+  "lttng_ust_opencl:clCreateCommandQueue",
+  "lttng_ust_opencl_arguments:kernel_info",
+  "lttng_ust_opencl_profiling:event_profiling_results"
+};
+
+static char* opencl_tracing_default_events[] = {
+  "lttng_ust_opencl_devices:*",
+  "lttng_ust_opencl_arguments:*",
+  "lttng_ust_opencl_build:infos*",
+};
+
+static char* opencl_tracing_default_exclude_events[] = {
+  "lttng_ust_opencl:clSetKernelArg*"
+  "lttng_ust_opencl:clGetKernelArg*",
+  "lttng_ust_opencl:clSetKernelExecInfo*"
+  "lttng_ust_opencl:clGetKernelInfo*",
+  "lttng_ust_opencl:clGetMemAllocInfoINTEL*"
+};
+
+static char* opencl_profiling_events[] = {
+  "lttng_ust_opencl_profiling:event_profiling",
+};
+
+
+void thapi_opencl_init(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+
+  int n_bookkeeping_events = ARRAY_LENGTH(opencl_bookkeeping_events);
+  thapi_enable_events_by_pattern(h, ev, n_bookkeeping_events, opencl_bookkeeping_events,
+                                 channel_name, 0, NULL);
+
+  lttng_event_destroy(ev);
+}
+
+
+void thapi_opencl_enable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  tracing_mode_t mode = thapi_ctl_tracing_mode();
+
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+
+  char *host_events_pattern = "lttng_ust_opencl:*";
+  if (mode == THAPI_CTL_TRACING_MODE_MINIMAL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_WARN,
+                  "mode minimal no supproted by opencl backend, using default mode");
+    mode = THAPI_CTL_TRACING_MODE_DEFAULT;
+  }
+
+  if (mode == THAPI_CTL_TRACING_MODE_FULL) {
+    thapi_enable_events_by_pattern(h, ev, 1, &host_events_pattern, channel_name,
+                                   0, NULL);
+  } else { // MODE_DEFAULT, add same pattern but with exclusion list
+    int n_exclude_default = ARRAY_LENGTH(opencl_tracing_default_exclude_events);
+    thapi_enable_events_by_pattern(h, ev, 1, &host_events_pattern, channel_name,
+                                   n_exclude_default,
+                                   opencl_tracing_default_exclude_events);
+  }
+
+  int n_tracing_default = ARRAY_LENGTH(opencl_tracing_default_events);
+  thapi_enable_events_by_pattern(h, ev, n_tracing_default, opencl_tracing_default_events,
+                                 channel_name, 0, NULL);
+
+  if (thapi_ctl_profiling()) {
+    int n_profiling = ARRAY_LENGTH(opencl_profiling_events);
+    thapi_enable_events_by_pattern(h, ev, n_profiling, opencl_profiling_events,
+                                   channel_name, 0, NULL);
+  }
+
+  lttng_event_destroy(ev);
+}
+
+
+void thapi_opencl_disable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+
+  int n_tracing_default_events = sizeof(opencl_tracing_default_events)
+                               / sizeof(*opencl_tracing_default_events);
+  thapi_disable_events_by_pattern(h, ev, n_tracing_default_events,
+                                  opencl_tracing_default_events, channel_name);
+
+  char *host_events_pattern = "lttng_ust_opencl:*";
+  thapi_disable_events_by_pattern(h, ev, 1, &host_events_pattern, channel_name);
+
+  if (thapi_ctl_profiling()) {
+    int n_profiling = ARRAY_LENGTH(opencl_profiling_events);
+    thapi_disable_events_by_pattern(h, ev, n_profiling, opencl_profiling_events,
+                                    channel_name);
+  }
+
+  lttng_event_destroy(ev);
+}
+
+
+static char *omp_events_pattern = "lttng_ust_ompt:*target*";
+
+
+void thapi_omp_init(struct lttng_handle *h, const char *channel_name) {
+  (void)h;
+  (void)channel_name;
+  // no-op, no bookkeeping events for omp
+  return;
+}
+
+
+void thapi_omp_enable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+  thapi_enable_events_by_pattern(h, ev, 1, &omp_events_pattern, channel_name, 0, NULL);
+  lttng_event_destroy(ev);
+}
+
+
+void thapi_omp_disable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+  thapi_disable_events_by_pattern(h, ev, 1, &omp_events_pattern, channel_name);
+  lttng_event_destroy(ev);
+}
+
+
+static char *hip_events_pattern = "lttng_ust_hip:*";
+
+
+void thapi_hip_init(struct lttng_handle *h, const char *channel_name) {
+  (void)h;
+  (void)channel_name;
+  // no-op, no bookkeeping events for hip
+  return;
+}
+
+
+void thapi_hip_enable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+  thapi_enable_events_by_pattern(h, ev, 1, &hip_events_pattern, channel_name, 0, NULL);
+  lttng_event_destroy(ev);
+}
+
+
+void thapi_hip_disable_tracing_events(struct lttng_handle *h, const char *channel_name) {
+  struct lttng_event *ev = lttng_event_create();
+  if (ev == NULL) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event");
+    return;
+  }
+  thapi_disable_events_by_pattern(h, ev, 1, &hip_events_pattern, channel_name);
+  lttng_event_destroy(ev);
+}
diff --git a/libthapictl/src/main.c b/libthapictl/src/main.c
new file mode 100644
index 00000000..898f3f4a
--- /dev/null
+++ b/libthapictl/src/main.c
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "thapi-ctl.h"
+
+
+void print_usage(const char* argv0) {
+  fprintf(stderr, "Usage: %s init|start|stop\n", argv0);
+}
+
+
+int main(int argc, char **argv) {
+  if (argc != 2) {
+    print_usage(argv[0]);
+    exit(1);
+  }
+
+  const char* cmd = argv[1];
+  printf("cmd '%s'\n", cmd);
+  fflush(stdout);
+  if (strcmp(cmd, "init") == 0) {
+    thapi_ctl_init(); // enable bookkeeping
+  } else if (strcmp(cmd, "start") == 0) {
+    thapi_ctl_start();
+  } else if (strcmp(cmd, "stop") == 0) {
+    thapi_ctl_stop();
+  } else if (strcmp(cmd, "list") == 0) {
+    thapi_ctl_print_events();
+  } else {
+    print_usage(argv[0]);
+    fprintf(stderr, "Error: unknown command '%s'\n", cmd);
+    return 1;
+  }
+  return 0;
+}
diff --git a/libthapictl/src/startstop.c b/libthapictl/src/startstop.c
new file mode 100644
index 00000000..092419d7
--- /dev/null
+++ b/libthapictl/src/startstop.c
@@ -0,0 +1,371 @@
+#include <assert.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <lttng/lttng.h>
+
+#include "thapi.h"
+#include "thapi-ctl.h"
+#include "backends.h"
+
+
+#define THAPI_LTTNG_SESSION_ID_ENV "LTTNG_UST_SESSION_ID"
+#define THAPI_LTTNG_CHANNEL_NAME_ENV "LTTNG_UST_CHANNEL_NAME"
+
+
+#define THAPI_CTL_DEFAULT_LOG_LEVEL THAPI_CTL_LOG_LEVEL_ERROR
+
+
+static const char* thapi_ctl_level_str[5] = {
+  "",
+  "ERROR",
+  "INFO",
+  "WARN",
+  "DEBUG"
+};
+
+
+static int thapi_ctl_log_level() {
+  static int level = -1;
+  if (level < 0) {
+    char *log_level_str = getenv("THAPI_CTL_LOG_LEVEL");
+    if (log_level_str == NULL || strlen(log_level_str) == 0) {
+      level = THAPI_CTL_DEFAULT_LOG_LEVEL;
+    } else {
+      char *endptr;
+      level = strtol(log_level_str, &endptr, 10);
+      if (*endptr != '\0') {
+        level = THAPI_CTL_DEFAULT_LOG_LEVEL;
+      }
+      if (level > THAPI_CTL_LOG_LEVEL_DEBUG || level < THAPI_CTL_LOG_LEVEL_ERROR) {
+        fprintf(stderr,
+                "libthapi-ctl[WARN]: unknown log level '%s', falling back to ERROR",
+                log_level_str);
+        level = THAPI_CTL_DEFAULT_LOG_LEVEL;
+      }
+    }
+  }
+  return level;
+}
+
+
+static int* thapi_ctl_enabled_backends() {
+  static int enabled_backends[] = {-1, -1, -1, -1, -1, -1, -1 };
+  if (enabled_backends[BACKEND_UNKNOWN] == -1) {
+    enabled_backends[BACKEND_UNKNOWN] = 0;
+    enabled_backends[BACKEND_ZE] = (getenv("LTTNG_UST_BACKEND_ZE_ENABLED") != NULL);
+    enabled_backends[BACKEND_OPENCL] =
+      (getenv("LTTNG_UST_BACKEND_OPENCL_ENABLED") != NULL);
+    enabled_backends[BACKEND_CUDA] = (getenv("LTTNG_UST_BACKEND_CUDA_ENABLED") != NULL);
+    enabled_backends[BACKEND_OMP_TARGET_OPERATIONS] =
+      (getenv("LTTNG_UST_BACKEND_OMP_TARGET_OPERATIONS_ENABLED") != NULL);
+    enabled_backends[BACKEND_OMP] = (getenv("LTTNG_UST_BACKEND_OMP_ENABLED") != NULL);
+    enabled_backends[BACKEND_HIP] = (getenv("LTTNG_UST_BACKEND_HIP_ENABLED") != NULL);
+  }
+  return enabled_backends;
+}
+
+
+static void log_events(struct lttng_handle* handle, const char *channel_name,
+                       const char *prefix) {
+  int ret;
+  struct lttng_event *event_list = NULL;
+  struct lttng_event *event = NULL;
+  thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "channel name %s", channel_name);
+  int n_events = lttng_list_events(handle, channel_name, &event_list);
+  if (n_events < 0) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "[%s] error getting events: %d",
+                  prefix, n_events);
+    return;
+  }
+  thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s] %d events", prefix, n_events);
+  const char *filter_str;
+  for (int i = 0; i < n_events; i++) {
+    event = &event_list[i];
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s] %s (enabled: %d)",
+                  prefix, event->name, event->enabled);
+    ret = lttng_event_get_filter_expression(event, &filter_str);
+    if (ret) {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "can't get filter expr %d", ret);
+    } else if (filter_str) {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s]   + filter str '%s'",
+                    prefix, filter_str);
+    }
+
+    int n_exclusions = lttng_event_get_exclusion_name_count(event);
+    if (n_exclusions < 0) {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                    "can't get exclusion name count %d", n_exclusions);
+    } else if (n_exclusions > 0) {
+      for (int j = 0; j < n_exclusions; j++) {
+        const char *exclusion_name;
+        ret = lttng_event_get_exclusion_name(event, j, &exclusion_name);
+        if (ret < 0) {
+          thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                        "can't get exclusion name[%d] %d", j, ret);
+        } else {
+          thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s]   + exclude %02d '%s'",
+                        prefix, j, exclusion_name);
+        }
+      }
+    }
+  }
+  if (event_list != NULL)
+    free(event_list);
+}
+
+
+void thapi_ctl_log(int log_level, const char *fmt, ...) {
+  assert(log_level >= THAPI_CTL_LOG_LEVEL_ERROR && log_level <= THAPI_CTL_LOG_LEVEL_DEBUG);
+  int config_level = thapi_ctl_log_level();
+  if (log_level <= config_level) {
+    fprintf(stderr, "libthapi-ctl[%s]: ", thapi_ctl_level_str[log_level]);
+    va_list va;
+    va_start(va, fmt);
+    vfprintf(stderr, fmt, va);
+    va_end(va);
+    fprintf(stderr, "\n");
+  }
+  if (log_level >= THAPI_CTL_LOG_LEVEL_DEBUG) {
+    fflush(stderr);
+  }
+}
+
+
+static const char* thapi_ctl_lttng_session_id() {
+  static char *session_id = NULL;
+  if (session_id == NULL) {
+    // first call, initialize static vars
+    session_id = getenv(THAPI_LTTNG_SESSION_ID_ENV);
+    if (session_id == NULL) {
+      // not called with iprof
+      session_id = "\0";
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG,
+                    "No env var '%s', application not launched using iprof, disable",
+                    THAPI_LTTNG_SESSION_ID_ENV);
+    } else {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG,
+                    "THAPI session id: %s", session_id);
+    }
+  }
+  return session_id;
+}
+
+
+static const char* thapi_ctl_lttng_channel_name() {
+  static char *channel_name = NULL;
+  if (channel_name == NULL) {
+    // first call, initialize static vars
+    channel_name = getenv(THAPI_LTTNG_CHANNEL_NAME_ENV);
+    if (channel_name == NULL) {
+      // not called with iprof
+      channel_name = "\0";
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG,
+                    "No env var '%s', application not launched using iprof, disable",
+                    THAPI_LTTNG_CHANNEL_NAME_ENV);
+    } else {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG,
+                    "THAPI channel: %s", channel_name);
+    }
+  }
+  return channel_name;
+}
+
+
+static struct lttng_handle* thapi_ctl_create_lttng_handle(const char *session_id) {
+  struct lttng_handle* handle = NULL;
+  if (strlen(session_id) > 0) {
+    struct lttng_domain dom;
+    memset(&dom, 0, sizeof(dom));
+    dom.type = LTTNG_DOMAIN_UST;
+    dom.buf_type = LTTNG_BUFFER_PER_UID;
+    handle = lttng_create_handle(session_id, &dom);
+    if (handle == NULL) {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                   "Failed to create lttng handle for session '%s'",
+                   session_id);
+
+    }
+  }
+  return handle;
+}
+
+
+void thapi_ctl_destroy_lttng_handle(struct lttng_handle *handle) {
+  if (handle != NULL) {
+    lttng_destroy_handle(handle);
+  }
+}
+
+
+int thapi_ctl_init() {
+  const char *session_id = thapi_ctl_lttng_session_id();
+  const char *channel_name = thapi_ctl_lttng_channel_name();
+  struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id);
+  if (handle == NULL)
+    return 0;
+
+  thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "init backends '%s' (channel '%s'",
+                session_id, channel_name);
+  log_events(handle, channel_name, "before");
+  int *backends_enabled = thapi_ctl_enabled_backends();
+  if (backends_enabled[BACKEND_CUDA]) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "cuda enabled");
+    thapi_cuda_init(handle, channel_name);
+  } else {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "cuda DISABLED");
+  }
+  if (backends_enabled[BACKEND_OPENCL]) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "opencl enabled");
+    thapi_opencl_init(handle, channel_name);
+  } else {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "opencl DISABLED");
+  }
+  if (backends_enabled[BACKEND_OMP]) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "omp enabled");
+    thapi_omp_init(handle, channel_name);
+  } else {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "omp DISABLED");
+  }
+  if (backends_enabled[BACKEND_HIP]) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "hip enabled");
+    thapi_hip_init(handle, channel_name);
+  } else {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "hip DISABLED");
+  }
+  log_events(handle, channel_name, "after");
+  thapi_ctl_destroy_lttng_handle(handle);
+  return 0;
+}
+
+
+int thapi_ctl_start() {
+  const char *session_id = thapi_ctl_lttng_session_id();
+  const char *channel_name = thapi_ctl_lttng_channel_name();
+  struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id);
+  if (handle == NULL)
+    return 0;
+
+  thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "start tracing '%s' (channel '%s')",
+                session_id, channel_name);
+  log_events(handle, channel_name, "before");
+  int *backends_enabled = thapi_ctl_enabled_backends();
+  if (backends_enabled[BACKEND_CUDA]) {
+    thapi_cuda_enable_tracing_events(handle, channel_name);
+  }
+  if (backends_enabled[BACKEND_OPENCL]) {
+    thapi_opencl_enable_tracing_events(handle, channel_name);
+  }
+  if (backends_enabled[BACKEND_OMP]) {
+    thapi_omp_enable_tracing_events(handle, channel_name);
+  }
+  if (backends_enabled[BACKEND_HIP]) {
+    thapi_hip_enable_tracing_events(handle, channel_name);
+  }
+  log_events(handle, channel_name, "after");
+  thapi_ctl_destroy_lttng_handle(handle);
+  return 0;
+}
+
+
+int thapi_ctl_stop() {
+  const char *session_id = thapi_ctl_lttng_session_id();
+  const char *channel_name = thapi_ctl_lttng_channel_name();
+  struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id);
+  if (handle == NULL)
+    return 0;
+
+  thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "stop tracing '%s' (channel '%s')",
+                session_id, channel_name);
+  log_events(handle, channel_name, "before");
+
+  int *backends_enabled = thapi_ctl_enabled_backends();
+  if (backends_enabled[BACKEND_CUDA]) {
+    thapi_cuda_disable_tracing_events(handle, channel_name);
+  }
+  if (backends_enabled[BACKEND_OPENCL]) {
+    thapi_opencl_disable_tracing_events(handle, channel_name);
+  }
+  if (backends_enabled[BACKEND_OMP]) {
+    thapi_omp_disable_tracing_events(handle, channel_name);
+  }
+  if (backends_enabled[BACKEND_HIP]) {
+    thapi_hip_disable_tracing_events(handle, channel_name);
+  }
+  log_events(handle, channel_name, "after");
+  thapi_ctl_destroy_lttng_handle(handle);
+  return 0;
+}
+
+
+void thapi_ctl_print_events() {
+  const char *session_id = thapi_ctl_lttng_session_id();
+  const char *channel_name = thapi_ctl_lttng_channel_name();
+  struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id);
+  if (handle == NULL)
+    return;
+
+  int ret;
+  struct lttng_event *event_list = NULL;
+  int n_events = lttng_list_events(handle, channel_name, &event_list);
+  if (n_events < 0) {
+    thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "error getting events: %d",
+                  n_events);
+    return;
+  }
+  printf("%d events\n", n_events);
+  const char *filter_str;
+  for (int i = 0; i < n_events; i++) {
+    struct lttng_event *event = &event_list[i];
+    printf("%s (enabled: %d)", event->name, event->enabled);
+    ret = lttng_event_get_filter_expression(event, &filter_str);
+    if (ret != 0) {
+      fprintf(stderr, "Error getting filter expression: %d", ret);
+    } else if (filter_str) {
+      printf(" (filter '%s')", filter_str);
+    }
+
+    int n_exclusions = lttng_event_get_exclusion_name_count(event);
+    if (n_exclusions < 0) {
+      thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                    "can't get exclusion name count %d", n_exclusions);
+    } else if (n_exclusions > 0) {
+      // make room for max event name lengths plus commas and null terminator
+      size_t exclusions_string_max_length = n_exclusions * (LTTNG_SYMBOL_NAME_LEN + 1);
+      char *exclusions_string = malloc(exclusions_string_max_length);
+      assert(exclusions_string != NULL);
+      char *exclusions_offset = exclusions_string;
+      for (int j = 0; j < n_exclusions; j++) {
+        const char *exclusion_name;
+        ret = lttng_event_get_exclusion_name(event, j, &exclusion_name);
+        if (ret < 0) {
+          thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                        "can't get exclusion name[%d] %d", j, ret);
+          break;
+        } else {
+          int nchar = snprintf(exclusions_offset, LTTNG_SYMBOL_NAME_LEN,
+                               "%s", exclusion_name);
+          if (nchar < 0 || nchar >= LTTNG_SYMBOL_NAME_LEN) {
+            thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR,
+                          "Error formatting event exclusions: ", j, nchar);
+            break;
+          } else {
+            exclusions_offset += nchar;
+          }
+          if (j < n_exclusions  - 1) {
+            sprintf(exclusions_offset, ",");
+            exclusions_offset++;
+          }
+        }
+        assert(exclusions_offset < (exclusions_string + exclusions_string_max_length));
+        *exclusions_offset = '\0';
+      }
+      printf(" (exclude '%s')", exclusions_string);
+    }
+    printf("\n");
+  }
+  if (event_list != NULL)
+    free(event_list);
+}
diff --git a/libthapictl/tests/Makefile b/libthapictl/tests/Makefile
new file mode 100644
index 00000000..10276e3b
--- /dev/null
+++ b/libthapictl/tests/Makefile
@@ -0,0 +1,19 @@
+
+.PHONY: all
+all: hello hello-ze
+
+hello-tp.o: hello-tp.c hello-tp.h
+	gcc -c -I. hello-tp.c
+
+hello.o: hello.c
+	gcc -c -I../ hello.c
+
+hello: hello.o hello-tp.o
+	gcc -o hello hello.o hello-tp.o -llttng-ust -ldl -L../ -lthapi-ctl -llttng-ctl
+
+hello-ze: hello-ze.cxx
+	icpx -fsycl -o $@ $< -I../ -llttng-ctl -Wl,-rpath,$(THAPI_HOME)/lib -L$(THAPI_HOME)/lib -lthapi-ctl
+
+.PHONY: clean
+clean:
+	rm -rf hello *.o
diff --git a/libthapictl/tests/test-thapi-ctl.c b/libthapictl/tests/test-thapi-ctl.c
new file mode 100644
index 00000000..2a23d098
--- /dev/null
+++ b/libthapictl/tests/test-thapi-ctl.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+
+#include "thapi-ctl.h"
+
+
+int main(int argc, char **argv) {
+  (void) argc;
+  (void) argv;
+  // TODO: is there something we can do here that requires no "backed"?
+  // maybe can do omp CPU?
+  // for now just makes sure can compile and run without blowing up
+  for (int i = 0; i < 10; i++) {
+    if (i % 2 == 1)
+      thapi_ctl_start();
+    printf("loop %d\n", i);
+    if (i % 2 == 1)
+      thapi_ctl_stop();
+  }
+}
diff --git a/utils/Makefile.am b/utils/Makefile.am
index b5df32e9..266a5e47 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -4,6 +4,9 @@ else
   WERROR =
 endif
 
+include_HEADERS = \
+	include/thapi.h
+
 noinst_HEADERS = \
 	include/utarray.h \
 	include/uthash.h \
diff --git a/utils/include/thapi.h b/utils/include/thapi.h
new file mode 100644
index 00000000..08830eca
--- /dev/null
+++ b/utils/include/thapi.h
@@ -0,0 +1,13 @@
+#pragma once
+
+enum backend_e {
+  BACKEND_UNKNOWN = 0,
+  BACKEND_ZE = 1,
+  BACKEND_OPENCL = 2,
+  BACKEND_CUDA = 3,
+  BACKEND_OMP_TARGET_OPERATIONS = 4,
+  BACKEND_OMP = 5,
+  BACKEND_HIP = 6,
+};
+typedef enum backend_e backend_t;
+typedef unsigned backend_level_t;
diff --git a/utils/xprof_utils.hpp b/utils/xprof_utils.hpp
index d5f88d90..cefba4c8 100644
--- a/utils/xprof_utils.hpp
+++ b/utils/xprof_utils.hpp
@@ -12,17 +12,7 @@
 #include <unordered_map>
 #include <vector>
 
-enum backend_e {
-  BACKEND_UNKNOWN = 0,
-  BACKEND_ZE = 1,
-  BACKEND_OPENCL = 2,
-  BACKEND_CUDA = 3,
-  BACKEND_OMP_TARGET_OPERATIONS = 4,
-  BACKEND_OMP = 5,
-  BACKEND_HIP = 6,
-};
-typedef enum backend_e backend_t;
-typedef unsigned backend_level_t;
+#include "thapi.h"
 
 // Should be ordered by the value of the backend_e
 const std::unordered_map<std::string, backend_t> pretty_backend_name_g = {
diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in
index 9047363d..e3f2547a 100755
--- a/xprof/xprof.rb.in
+++ b/xprof/xprof.rb.in
@@ -49,9 +49,40 @@ def exec(cmd, opts: {}, debug: true)
 
   LOGGER.warn { stderr_str.strip } unless stderr_str.empty?
   LOGGER.debug { stdout_str.strip } unless stdout_str.empty?
+
   stdout_str
 end
 
+def lttng_env()
+  { 'LTTNG_HOME' => lttng_home_dir }
+end
+
+# TODO: should we just apply this to ENV once and use everywhere?
+def thapi_env(backends, tracing_mode, trace_from_start, profiling)
+  rval = lttng_env.dup
+  rval.merge!({
+    'LTTNG_UST_SESSION_ID' => lttng_session_uuid,
+    'LTTNG_UST_CHANNEL_NAME' => lttng_blocking_channel_name,
+    'LTTNG_UST_TRACING_MODE' => tracing_mode,
+    'LTTNG_UST_TRACE_FROM_START' => trace_from_start.to_s,
+    # TODO: combine with LTTNG_UST_{be}_PROFILE???
+    'LTTNG_UST_PROFILING' => profiling ? "1" : "0"
+  })
+  backends.each do |name|
+    rval["LTTNG_UST_BACKEND_#{name.upcase}_ENABLED"] = '1'
+  end
+  rval
+end
+
+def exec_thapi_ctl(command, backends, tracing_mode, trace_from_start, profiling)
+  ENV.merge!(thapi_env(backends, tracing_mode, trace_from_start, profiling))
+  # exec("lttng list --channel #{channel_name} --userspace #{lttng_session_uuid}")
+  # exec("#{BINDIR}/thapi-ctl list")
+  exec("#{BINDIR}/thapi-ctl #{command}")
+  # exec("#{BINDIR}/thapi-ctl list")
+  # exec("lttng list --channel #{channel_name} --userspace #{lttng_session_uuid}")
+end
+
 #    _
 #   |_ |\ | \  /
 #   |_ | \|  \/
@@ -322,6 +353,10 @@ def lttng_session_uuid
   Digest::MD5.hexdigest(lttng_trace_dir_tmp)
 end
 
+def lttng_blocking_channel_name()
+  'blocking-channel'
+end
+
 def sampling?
   return false unless OPTIONS[:sample]
 
@@ -429,7 +464,8 @@ def launch_usr_bin(env, cmd)
   end
 end
 
-def enable_events_ze(channel_name, tracing_mode: 'default', profiling: true)
+def enable_events_ze(channel_name, tracing_mode: 'default', profiling: true,
+                     trace_from_start: true)
   lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
   case tracing_mode
   when 'minimal'
@@ -464,7 +500,23 @@ def enable_events_ze(channel_name, tracing_mode: 'default', profiling: true)
   end
 end
 
-def enable_events_cl(channel_name, tracing_mode: 'default', profiling: true)
+def enable_events_cl(channel_name, tracing_mode: 'default', profiling: true,
+                     trace_from_start: true)
+  exec_thapi_ctl("init", ["opencl"], tracing_mode, trace_from_start, profiling)
+  if trace_from_start
+    exec_thapi_ctl("start", ["opencl"], tracing_mode, trace_from_start, profiling)
+  end
+  return
+
+  # lttng_enable_bookkeeping = "lttng enable-event --userspace --session=#{lttng_session_uuid_bookkeeping} --channel=#{channel_name}"
+  # bookkeeping_events = [
+  #   "lttng_ust_opencl:clGetDeviceIDs",
+  #   "lttng_ust_opencl:clCreatSubDevices",
+  #   "lttng_ust_opencl:clCreateCommandQueue",
+  #   "lttng_ust_opencl_arguments:kernel_info",
+  # ]
+  # exec("#{lttng_enable_bookkeeping} #{bookkeeping_events.join(',')}")
+  #
   lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
   case tracing_mode
   when 'full'
@@ -494,24 +546,92 @@ def enable_events_cl(channel_name, tracing_mode: 'default', profiling: true)
   end
 end
 
-def enable_events_cuda(channel_name, tracing_mode: 'default', profiling: true)
+def enable_events_cuda(channel_name, tracing_mode: 'default', profiling: true,
+                       trace_from_start: true)
+  exec_thapi_ctl("init", ["cuda"], tracing_mode, trace_from_start, profiling)
+  if trace_from_start
+    exec_thapi_ctl("start", ["cuda"], tracing_mode, trace_from_start, profiling)
+  end
+  # exec("lttng list --channel #{channel_name} --userspace #{lttng_session_uuid}")
+  return
+
+
+  # lttng_enable_bookkeeping = "lttng enable-event --userspace --session=#{lttng_session_uuid_bookkeeping} --channel=#{channel_name}"
+  # required_events = [
+  #   "lttng_ust_cuda:cuGetProcAddress_v2_entry",
+  #   "lttng_ust_cuda:cuGetProcAddress_v2_exit",
+  # ].join(',')
+  #   #"lttng_ust_cuda:cuGetExportTable_entry",
+  #   #"lttng_ust_cuda:cuGetExportTable_exit"
+  #   #"lttng_ust_cuda:cuGetProcAddress_v2_entry",
+  #   #"lttng_ust_cuda:cuGetProcAddress_v2_exit",
+  #   #"lttng_ust_cuda:cuDeviceGetCount_entry",
+  #   #"lttng_ust_cuda:cuDeviceGetCount_exit",
+  #   #"lttng_ust_cuda:cuDeviceGet_entry",
+  #   #"lttng_ust_cuda:cuDeviceGet_exit",
+  #   #"lttng_ust_cuda:cuDeviceGetName_entry",
+  #   #"lttng_ust_cuda:cuDeviceGetName_exit",
+  # bookkeeping_events = [
+  #   "lttng_ust_cuda:cuCtxCreate_entry",
+  #   "lttng_ust_cuda:cuCtxCreate_exit",
+  #   "lttng_ust_cuda:cuCtxCreate_v2_entry",
+  #   "lttng_ust_cuda:cuCtxCreate_v2_exit",
+  #   "lttng_ust_cuda:cuCtxCreate_v3_entry",
+  #   "lttng_ust_cuda:cuCtxCreate_v3_exit",
+  #   "lttng_ust_cuda:cuCtxDestroy_entry",
+  #   "lttng_ust_cuda:cuCtxDestroy_exit",
+  #   "lttng_ust_cuda:cuCtxDestroy_v2_entry",
+  #   "lttng_ust_cuda:cuCtxDestroy_v2_exit",
+  #   "lttng_ust_cuda:cuCtxSetCurrent_entry",
+  #   "lttng_ust_cuda:cuCtxSetCurrent_exit",
+  #   "lttng_ust_cuda:cuCtxPushCurrent_entry",
+  #   "lttng_ust_cuda:cuCtxPushCurrent_exit",
+  #   "lttng_ust_cuda:cuCtxPushCurrent_v2_entry",
+  #   "lttng_ust_cuda:cuCtxPushCurrent_v2_exit",
+  #   "lttng_ust_cuda:cuCtxPopCurrent_entry",
+  #   "lttng_ust_cuda:cuCtxPopCurrent_exit",
+  #   "lttng_ust_cuda:cuCtxPopCurrent_v2_entry",
+  #   "lttng_ust_cuda:cuCtxPopCurrent_v2_exit",
+  #   "lttng_ust_cuda:cuDevicePrimaryCtxRetain_entry",
+  #   "lttng_ust_cuda:cuDevicePrimaryCtxRetain_exit",
+  #   "lttng_ust_cuda:cuStreamCreate_entry",
+  #   "lttng_ust_cuda:cuStreamCreate_exit",
+  #   "lttng_ust_cuda:cuStreamCreateWithPriority_entry",
+  #   "lttng_ust_cuda:cuStreamCreateWithPriority_exit",
+  #   "lttng_ust_cuda:cuModuleGetFunction_entry",
+  #   "lttng_ust_cuda:cuModuleGetFunction_exit"
+  # ].join(',')
+  # exec("#{lttng_enable_bookkeeping} #{bookkeeping_events},#{required_events}")
+  #
   lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
   exec("#{lttng_enable} lttng_ust_cuda:*")
+  exec("#{lttng_enable} #{required_events}")
   exec("#{lttng_enable} lttng_ust_cuda_properties")
   exec("#{lttng_enable} lttng_ust_cuda_profiling:*") if profiling
 end
 
-def enable_events_hip(channel_name, tracing_mode: 'default', profiling: true)
-  lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
-  exec("#{lttng_enable} lttng_ust_hip:*")
+def enable_events_hip(channel_name, tracing_mode: 'default', profiling: true,
+                      trace_from_start: true)
+  # lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
+  # exec("#{lttng_enable} lttng_ust_hip:*")
+  exec_thapi_ctl("init", ["hip"], tracing_mode, trace_from_start, profiling)
+  if trace_from_start
+    exec_thapi_ctl("start", ["hip"], tracing_mode, trace_from_start, profiling)
+  end
 end
 
-def enable_events_omp(channel_name, tracing_mode: 'default', profiling: true)
-  lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
-  exec("#{lttng_enable} lttng_ust_ompt:*target*")
+def enable_events_omp(channel_name, tracing_mode: 'default', profiling: true,
+                      trace_from_start: true)
+  # lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
+  # exec("#{lttng_enable} lttng_ust_ompt:*target*")
+  exec_thapi_ctl("init", ["omp"], tracing_mode, trace_from_start, profiling)
+  if trace_from_start
+    exec_thapi_ctl("start", ["omp"], tracing_mode, trace_from_start, profiling)
+  end
 end
 
-def enable_events_metadata(channel_name, tracing_mode: 'default', profiling: true)
+def enable_events_metadata(channel_name, tracing_mode: 'default', profiling: true,
+                           trace_from_start: true)
   lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}"
   exec("#{lttng_enable} lttng_ust_thapi:*")
 end
@@ -528,7 +648,8 @@ def setup_lttng(backends)
 
   File.write(File.join(lttng_trace_dir_tmp, 'thapi_metadata.yaml'), { type: 'lttng' }.to_yaml)
 
-  channel_name = 'blocking-channel'
+  channel_name = lttng_blocking_channel_name
+
   exec("lttng enable-channel --userspace --session=#{lttng_session_uuid} --blocking-timeout=inf #{channel_name}")
   exec("lttng add-context    --userspace --session=#{lttng_session_uuid} --channel=#{channel_name} -t vpid -t vtid")
 
@@ -536,7 +657,8 @@ def setup_lttng(backends)
   (backends + ['metadata']).each do |name|
     send("enable_events_#{name}", channel_name,
          tracing_mode: OPTIONS[:'tracing-mode'],
-         profiling: OPTIONS[:profile])
+         profiling: OPTIONS[:profile],
+         trace_from_start: OPTIONS[:'trace-from-start'])
   end
 
   if OPTIONS.include?(:sample)
@@ -626,15 +748,17 @@ def on_node_processing_sync_and_rename(syncd)
 end
 
 # Start, Stop lttng, amd do the on-node analsysis
-def trace_and_on_node_processing(usr_argv)
+def trace_and_on_node_processing(usr_argv, options)
   # All masters set a future global barrier
   Sync_daemon.open do |syncd|
     # Load Tracers and APILoaders Lib
     backends, h = env_tracers
 
-    # All ranks need to set the LLTTNG_HOME env
-    # so they can have access to the daemon
-    ENV['LTTNG_HOME'] = lttng_home_dir
+    # TODO: set in one place, or pass per command?
+    ENV.merge!(thapi_env(backends, options[:'tracing-mode'],
+                         options[:'trace-from-start'],
+                         options[:'profile']))
+
     # Only local master spawn LTTNG daemon and start session
     setup_lttng(backends) if mpi_local_master?
     syncd.local_barrier('waiting_for_lttng_setup')
@@ -744,6 +868,8 @@ if __FILE__ == $PROGRAM_NAME
   end
   parser.on('--[no-]profile', 'Trigger for device activities profiling', default: true)
   parser.on('--[no-]analysis', 'Trigger for analysis', default: true)
+  parser.on('--[no-]trace-from-start', 'Disable tracing at start (default enabled)',
+            default: true)
 
   # General Options
   parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.",
@@ -819,7 +945,7 @@ if __FILE__ == $PROGRAM_NAME
 
   # Right now, `replay` means no tracing.
   # But we don't have a way of disabling post-processing
-  folder = OPTIONS.include?(:replay) ? OPTIONS[:replay] || last_trace_saved : trace_and_on_node_processing(ARGV)
+  folder = OPTIONS.include?(:replay) ? OPTIONS[:replay] || last_trace_saved : trace_and_on_node_processing(ARGV, options)
   if mpi_master?
     warn("THAPI: Trace location: #{folder}")
     global_processing(folder) if OPTIONS[:analysis]