diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 7380144d..5c69ec14 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -28,7 +28,8 @@ jobs: with: path: ~/babeltrace2/2.0.4 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler + - run: sudo apt-add-repository ppa:lttng/stable-2.13 + - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler if: steps.babeltrace2.outputs.cache-hit != 'true' - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel if: steps.babeltrace2.outputs.cache-hit != 'true' @@ -69,7 +70,8 @@ jobs: with: path: ~/babeltrace2/2.0.4 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind + - run: sudo apt-add-repository ppa:lttng/stable-2.13 + - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -112,7 +114,8 @@ jobs: with: path: ~/babeltrace2/2.0.4 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler + - run: sudo apt-add-repository ppa:lttng/stable-2.13 + - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -146,9 +149,12 @@ jobs: include: - bats_file: 'iprof' thapi_sync_daemon: 'fs' # Need to force, if not `THAPI_SYNC_DAEMON` will be set to empty list + - bats_file: 'libthapi-ctl' + thapi_sync_daemon: 'fs' env: THAPI_TEST_BIN: clinfo - THAPI_BIN_DIR: ./build/ici/bin + THAPI_BIN_DIR: ${{ github.workspace }}/build/ici/bin + THAPI_INSTALL_DIR: ${{ github.workspace }}/build/ici THAPI_SYNC_DAEMON: ${{ matrix.thapi_sync_daemon }} steps: - uses: actions/checkout@v4 @@ -165,12 +171,16 @@ jobs: name: thapi-bin - name: Untar THAPI run: tar -xvf thapi.tar - - run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils + - run: sudo apt-add-repository ppa:lttng/stable-2.13 + - run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils liblttng-ctl-dev ocl-icd-opencl-dev jq - run: sudo gem install babeltrace2 opencl_ruby_ffi - name: Load Babeltrace2 run: | echo "$HOME/babeltrace2/2.0.4/bin" >> $GITHUB_PATH echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.4/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + - name: ldconfig + run: | + ldconfig -p | grep libOpenCL - name: integration test ${{ matrix.bats_file }} run: | bats integration_tests/${{ matrix.bats_file }}.bats @@ -189,7 +199,8 @@ jobs: with: path: ~/babeltrace2/2.0.4 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind + - run: sudo apt-add-repository ppa:lttng/stable-2.13 + - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -227,7 +238,8 @@ jobs: with: path: ~/babeltrace2/2.0.4 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind + - run: sudo apt-add-repository ppa:lttng/stable-2.13 + - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -259,7 +271,8 @@ jobs: with: path: ~/babeltrace2/2.0.4 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind + - run: sudo apt-add-repository ppa:lttng/stable-2.13 + - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | diff --git a/Makefile.am b/Makefile.am index 5b9ba8f1..bc244edb 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ ACLOCAL_AMFLAGS = -I m4 -SUBDIRS = utils xprof sampling opencl ze cuda omp hip +SUBDIRS = utils xprof sampling opencl ze cuda omp hip libthapictl EXTRA_DIST = autogen.sh README.md .valgrind/dlopen.supp diff --git a/configure.ac b/configure.ac index e8d3b8a2..e5832d4d 100644 --- a/configure.ac +++ b/configure.ac @@ -72,6 +72,7 @@ AM_CONDITIONAL([FOUND_MPI], [test "x$found_mpi" = xyes]) PKG_CHECK_MODULES([LIBFFI], [libffi >= 3.2]) PKG_CHECK_MODULES([BABELTRACE2], [babeltrace2 >= 2.0]) PKG_CHECK_MODULES([LTTNG_UST], [lttng-ust >= 2.10]) +PKG_CHECK_MODULES([LTTNG_CTL], [lttng-ctl >= 2.13.13]) PKG_CHECK_MODULES([PROTOBUF], [protobuf >= 3.0]) AX_RUBY_EXTENSION([cast-to-yaml], [yes]) @@ -114,6 +115,7 @@ AC_CONFIG_FILES([ hip/Makefile xprof/Makefile utils/Makefile + libthapictl/Makefile ]) AC_CONFIG_FILES([utils/test_wrapper_thapi_text_pretty.sh], [chmod +x utils/test_wrapper_thapi_text_pretty.sh]) AC_CONFIG_FILES([opencl/tracer_opencl.sh], [chmod +x opencl/tracer_opencl.sh]) diff --git a/cuda/cuda_model.rb b/cuda/cuda_model.rb index 0d49bf1d..18f96c7d 100644 --- a/cuda/cuda_model.rb +++ b/cuda/cuda_model.rb @@ -307,7 +307,7 @@ def upper_snake_case(str) fullname = "#{name}#{versions.size - i > 1 ? "_v#{versions.size - i}" : ""}#{suffix}" fullname = "#{name}_v2#{suffix}" unless command_names.include?(fullname) sstr = < 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) { + if ((!_trace_from_start() || tracepoint_enabled(lttng_ust_cuda, #{fullname}_#{START})) #{suffixes.size > 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) { wrap_#{fullname}(pfn); } EOF diff --git a/cuda/tracer_cuda_helpers.include.c b/cuda/tracer_cuda_helpers.include.c index c18d1bac..0df77acb 100644 --- a/cuda/tracer_cuda_helpers.include.c +++ b/cuda/tracer_cuda_helpers.include.c @@ -483,6 +483,15 @@ static inline void _primary_context_reset(CUdevice dev) { pthread_mutex_unlock(&_primary_contexts_mutex); } +static int _trace_from_start() { + static int trace_from_start = -1; + if (trace_from_start == -1) { + const char *env_value = getenv("LTTNG_UST_TRACE_FROM_START"); + trace_from_start = (env_value == NULL || strcmp(env_value, "1") == 0); + } + return trace_from_start; +} + static void THAPI_ATTRIBUTE_DESTRUCTOR _lib_cleanup() { if (_do_cleanup) { diff --git a/integration_tests/gtensor.bats b/integration_tests/gtensor.bats new file mode 100644 index 00000000..89fda79f --- /dev/null +++ b/integration_tests/gtensor.bats @@ -0,0 +1,37 @@ +#!/usr/bin/env bats + +setup_file() { + export IPROF=$THAPI_INSTALL_DIR/bin/iprof + + bdir="build-$GTENSOR_DEVICE" + + (cd integration_tests/gtensor; + cmake -S . -B $bdir -DGTENSOR_DEVICE=$GTENSOR_DEVICE \ + -DTHAPI_PATH=$THAPI_INSTALL_DIR \ + && cmake --build $bdir) + + export THAPI_TEST_BIN=$(pwd)/integration_tests/gtensor/$bdir/axpy_start_stop + export LD_LIBRARY_PATH=$THAPI_INSTALL_DIR/lib:$LD_LIBRARY_PATH +} + +teardown_file() { + rm -rf $THAPI_HOME/thapi-traces +} + +@test "axpy-${GTENSOR_DEVICE}-trace-from-start" { + $IPROF --debug 0 $THAPI_TEST_BIN $TEST_ARGS + # NOTE: gtensor kernel names are complex, but all start with gt::, look it up rather + # than hard coding and making it fragile. Includes quotes. + kernel_name=$($IPROF -r -j | jq '.device.data | keys[]' | grep 'gt::') + write_count=$($IPROF -r -j | jq ".device.data | .[${kernel_name}].call") + # 0th call, plus 5 odd calls + [ $write_count -eq 6 ] +} + +@test "axpy-${GTENSOR_DEVICE}-no-trace-from-start" { + $IPROF --no-trace-from-start --debug 0 $THAPI_TEST_BIN $TEST_ARGS + kernel_name=$($IPROF -r -j | jq '.device.data | keys[]' | grep 'gt::') + write_count=$($IPROF -r -j | jq ".device.data | .[${kernel_name}].call") + # 5 odd calls, does not include 0 since profiling should be disabled at start + [ $write_count -eq 5 ] +} diff --git a/integration_tests/gtensor/CMakeLists.txt b/integration_tests/gtensor/CMakeLists.txt new file mode 100644 index 00000000..519b899c --- /dev/null +++ b/integration_tests/gtensor/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.23 FATAL_ERROR) + +project(thapi-integration-test) + +if(GTENSOR_DEVICE STREQUAL "cuda") + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + +option(USE_THAPI_CTL "use thapi-ctl to start/stop tracing" ON) +set(THAPI_PATH "/opt/thapi" CACHE STRING "Path to thapi installation") + +message(STATUS "${PROJECT_NAME}: THAPI_PATH=${THAPI_PATH}") + +# add dependencies +include(cmake/CPM.cmake) +CPMFindPackage(NAME gtensor + GITHUB_REPOSITORY wdmapp/gtensor + GIT_TAG "main" + OPTIONS "GTENSOR_ENABLE_BLAS ON" + "GTENSOR_ENABLE_SOLVER ON") + +add_executable(axpy_start_stop) +target_gtensor_sources(axpy_start_stop PRIVATE axpy_start_stop.cxx) +target_link_libraries(axpy_start_stop gtensor::gtensor) + +if (${USE_THAPI_CTL}) + find_package(PkgConfig REQUIRED) + pkg_check_modules(LTTNG_CTL REQUIRED lttng-ctl) + + add_library(thapi_ctl INTERFACE IMPORTED) + target_include_directories(thapi_ctl INTERFACE "${THAPI_PATH}/include") + target_link_libraries(thapi_ctl INTERFACE "${THAPI_PATH}/lib/libthapi-ctl.so") + target_link_libraries(thapi_ctl INTERFACE ${LTTNG_CTL_LDFLAGS}) + target_link_libraries(axpy_start_stop thapi_ctl) +else() + target_compile_definitions(axpy_start_stop PRIVATE NO_THAPI_CTL) +endif() diff --git a/integration_tests/gtensor/README.md b/integration_tests/gtensor/README.md new file mode 100644 index 00000000..c849612c --- /dev/null +++ b/integration_tests/gtensor/README.md @@ -0,0 +1,13 @@ +# gtensor integration tests + +Allows testing cuda, hip, and ze backends with single source file. + +# Usage + +``` +GTENSOR_DEVICE=cuda bats ./integration_tests/gtensor.bats +``` +Or "hip" or "sycl" + +Note that THAPI hip backend does not yet do device profiling, modifications will need to be +made for tests to pass on hip. diff --git a/integration_tests/gtensor/axpy_start_stop.cxx b/integration_tests/gtensor/axpy_start_stop.cxx new file mode 100644 index 00000000..0eb60fa9 --- /dev/null +++ b/integration_tests/gtensor/axpy_start_stop.cxx @@ -0,0 +1,105 @@ + +#include + +#include + +#ifndef NO_THAPI_CTL +#include "thapi-ctl.h" +#endif + +// provides convenient shortcuts for common gtensor functions, for example +// underscore ('_') to represent open slice ends. +using namespace gt::placeholders; + +/** + * Templated function for daxpy that can work with host gtensor or device + * gtensor. Relies on C++11 mandatory copy elision to move the result into the + * LHS it is assigned to, rather than copying all the data (very important for + * large arrays). Input arrays are passed by reference, again to avoid copying + * all the data which would happen by default (just like std::vector, gtensor + * has copy semantics by default). + */ +template +auto daxpy(double a, const Ex& x, const Ey& y) +{ + // The expression 'a * x + y' generates a gfunction template, which is an + // un-evaluated expression. In this case, we force evaluation in the return + // statementso a new gtensor is created. It can be useful to not add the + // `gt::eval`, in which case the un-evaluated expression will be returned and + // can be combined with other operations before being evaluated, or assigned + // to a gtensor to force immediate evaluation. + return gt::eval(a * x + y); +} + +int main(int argc, char** argv) +{ + int n = 1024 * 1024; + int nprint = 32; + + double a = 0.5; + + // Define and allocate two 1d vectors of size n on the host. Declare + // but don't allocate a third 1d host vector for storing the result. + gt::gtensor h_x(gt::shape(n)); + gt::gtensor h_y = gt::empty_like(h_x); + gt::gtensor h_axpy; + + // initialize the vectors, x is twice it's index values and y is equal + // to it's index values. We will perform .5*x + y, so the result should be + // axpy(i) = 2i. + for (int i = 0; i < n; i++) { + h_x(i) = 2.0 * static_cast(i); + h_y(i) = static_cast(i); + } + +#ifndef GTENSOR_HAVE_DEVICE +#error "device required" +#endif + + // Define and allocate device versions of h_x and h_y, and declare + // a varaible for the result on gpu. + gt::gtensor d_x(gt::shape(n)); + gt::gtensor d_y = gt::empty_like(d_x); + gt::gtensor d_axpy; + + // Explicit copies of input from host to device. Note that this is an + // overload of the copy function for gtensor and gtensor_span types, not + // std::copy which has a different signature. The source is the first + // argument and destination the second argument. Currently thrust::copy is + // used under the hood in the implementation. + gt::copy(h_x, d_x); + gt::copy(h_y, d_y); + + // This automatically generates a computation kernel to run on the + // device. + auto expr = a * d_x + d_y; + + for (int i = 0; i < 10; i++) { +#ifndef NO_THAPI_CTL + if (i % 2 == 1) { + thapi_ctl_start(); + } +#endif + + d_axpy = gt::eval(expr); + h_axpy = gt::empty_like(h_x); + gt::copy(d_axpy, h_axpy); + gt::synchronize(); + + std::cout << "axpy(10) = " << h_axpy(10) << std::endl; + +#ifndef NO_THAPI_CTL + if (i % 2 == 1) { + thapi_ctl_stop(); + } +#endif + } + + // Define a slice to print a subset of elements for spot checking the + // result. + auto print_slice = gt::gslice(_, _, n / nprint); + std::cout << "a = " << a << std::endl; + std::cout << "x = " << h_x.view(print_slice) << std::endl; + std::cout << "y = " << h_y.view(print_slice) << std::endl; + std::cout << "a*x + y = " << h_axpy.view(print_slice) << std::endl; +} diff --git a/integration_tests/gtensor/cmake/CPM.cmake b/integration_tests/gtensor/cmake/CPM.cmake new file mode 100644 index 00000000..d0fd0e8e --- /dev/null +++ b/integration_tests/gtensor/cmake/CPM.cmake @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: MIT +# +# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors + +set(CPM_DOWNLOAD_VERSION 0.39.0) +set(CPM_HASH_SUM "66639bcac9dd2907b2918de466783554c1334446b9874e90d38e3778d404c2ef") + +if(CPM_SOURCE_CACHE) + set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +elseif(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +else() + set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +endif() + +# Expand relative path. This is important if the provided path contains a tilde (~) +get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) + +file(DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} +) + +include(${CPM_DOWNLOAD_LOCATION}) diff --git a/integration_tests/libthapi-ctl.bats b/integration_tests/libthapi-ctl.bats new file mode 100644 index 00000000..87f8ca3c --- /dev/null +++ b/integration_tests/libthapi-ctl.bats @@ -0,0 +1,64 @@ +#!/usr/bin/env bats + +setup_file() { + export IPROF=$THAPI_INSTALL_DIR/bin/iprof + + (cd integration_tests/libthapi-ctl; + #ld --verbose | grep SEARCH_DIR | tr -s ' ;' \\012; + #gcc -print-search-dirs | grep libraries | tr -s ' ;=:' \\012; + make clean && make -j; + #ldd hello-cl-startstop; + #objdump -p hello-cl-startstop; + ) + + export THAPI_TEST_BIN=$(pwd)/integration_tests/libthapi-ctl/hello-cl-startstop + export LD_LIBRARY_PATH=$THAPI_INSTALL_DIR/lib:$LD_LIBRARY_PATH + + # Note: the pocl version in Ubuntu 20.04 (old lts as of 2024) does not + # support modern CPUs and will fail kernel compilation (notably does not work + # on CI). To keep things simple, we do clEnqueueWriteBuffer only if pocl is the + # default opencl platform. + IS_POCL=$(clinfo -l | grep 'Platform #0' | grep -c "Portable Computing Language" || true) + + # echo "# IS_POCL=$IS_POCL" >&3 + + TEST_ARGS="" + if [ "$IS_POCL" = "0" ]; then + TEST_ARGS="0 0 1" + fi + export TEST_ARGS IS_POCL +} + +teardown_file() { + rm -rf $THAPI_HOME/thapi-traces +} + +@test "startstop-cl-trace-from-start" { + $IPROF --debug 0 $THAPI_TEST_BIN $TEST_ARGS + TRACE=$(ls -1t $THAPI_HOME/thapi-traces | head -n1) + write_count=$($IPROF -r -j | jq '.device.data.clEnqueueWriteBuffer.call') + # 0th call, plus 5 odd calls + [ $write_count -eq 6 ] + + # echo "# IS_POCL=$IS_POCL" >&3 + + if [ "$IS_POCL" = "0" ]; then + kernel_count=$($IPROF -r -j | jq '.device.data.hello_world.call') + [ $write_count -eq 6 ] + fi +} + +@test "startstop-cl-no-trace-from-start" { + $IPROF --no-trace-from-start --debug 0 $THAPI_TEST_BIN $TEST_ARGS + write_count=$($IPROF -r -j | jq '.device.data.clEnqueueWriteBuffer.call') + # 5 odd calls, does not include 0 since profiling should be disabled at start + [ $write_count -eq 5 ] + + # echo "# IS_POCL=$IS_POCL" >&3 + + if [ "$IS_POCL" = "0" ]; then + kernel_count=$($IPROF -r -j | jq '.device.data.hello_world.call') + [ $write_count -eq 5 ] + # echo "# IS_POCL=$IS_POCL write_count=$write_count" >&3 + fi +} diff --git a/integration_tests/libthapi-ctl/Makefile b/integration_tests/libthapi-ctl/Makefile new file mode 100644 index 00000000..0b1e437c --- /dev/null +++ b/integration_tests/libthapi-ctl/Makefile @@ -0,0 +1,13 @@ +CFLAGS = -g -Wall -Wextra $(WERROR) -DCL_TARGET_OPENCL_VERSION=220 -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -I$(THAPI_INSTALL_DIR)/include +LDFLAGS = /usr/lib/x86_64-linux-gnu/libOpenCL.so -L$(THAPI_INSTALL_DIR)/lib -lthapi-ctl -llttng-ctl + + +.PHONY: all +all: hello-cl-startstop + +hello-cl-startstop: hello-cl-startstop.c cl_utils.c + gcc $(CFLAGS) $^ $(LDFLAGS) -o $@ + +.PHONY: clean +clean: + rm -f hello-cl-startstop diff --git a/integration_tests/libthapi-ctl/cl_utils.c b/integration_tests/libthapi-ctl/cl_utils.c new file mode 100644 index 00000000..548392b3 --- /dev/null +++ b/integration_tests/libthapi-ctl/cl_utils.c @@ -0,0 +1,163 @@ +#include +#include +#include "./cl_utils.h" + +// _ +// |_) _ | _ ._ ._ | _. _|_ _ +// |_) (_) | (/_ | |_) | (_| |_ (/_ +// | + +/* - - - - +IO +- - - - */ +int read_from_binary(unsigned char **output, size_t *size, const char *name) { + FILE *fp = fopen(name, "rb"); + if (!fp) { + return -1; + } + + fseek(fp, 0, SEEK_END); + *size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + *output = (unsigned char *)malloc(*size * sizeof(unsigned char)); + if (!*output) { + fclose(fp); + return -1; + } + + size_t ret = fread(*output, *size, 1, fp); + fclose(fp); + if (ret != *size) + return -1; + + return 0; +} + +int write_to_binary(unsigned char *input, size_t size, const char *name) { + FILE *fp = fopen(name, "wb"); + if (!fp) { + return -1; + } + fwrite(input, 1, size, fp); + fclose(fp); + return 0; +} + +char *read_from_file(const char *filename) +{ + size_t size = 0; + FILE *file = fopen(filename, "r"); + + if(!file) { + fputs("File error.\n", stderr); + return NULL; + } + + fseek(file, 0, SEEK_END); + size = ftell(file); + rewind(file); + + char *result = (char *) malloc(size + 1); + result[size] = '\0'; + if(!result) { + fputs("Memory error.\n", stderr); + return NULL; + } + + if(fread(result, 1, size, file) != size) { + fputs("Read error.\n", stderr); + return NULL; + } + + fclose(file); + return result; +} +/* - - - - +OpenCL Error +- - - - */ + +const char *getErrorString(cl_int error) +{ +switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + case -69: return "CL_INVALID_PIPE_SIZE"; + case -70: return "CL_INVALID_DEVICE_QUEUE"; + + default: return "Unknown OpenCL error"; + } +} + +void check_error(cl_int error, char const *name) { + if (error != CL_SUCCESS) { + fprintf(stderr, "Non-successful return code %d (%s) for %s. Exiting.\n", error, getErrorString(error), name); + exit(EXIT_FAILURE); + } +} + +void exit_msg(char const *str){ + fprintf(stderr, "%s \n", str); + exit(EXIT_FAILURE); +} + diff --git a/integration_tests/libthapi-ctl/cl_utils.h b/integration_tests/libthapi-ctl/cl_utils.h new file mode 100644 index 00000000..cac450fc --- /dev/null +++ b/integration_tests/libthapi-ctl/cl_utils.h @@ -0,0 +1,10 @@ +extern int read_from_binary(unsigned char **output, size_t *size, const char *name); +extern int write_to_binary(unsigned char *input, size_t size, const char *name); + +extern char *read_from_file(const char *filename); + +extern const char *getErrorString(cl_int error); + +extern void check_error(cl_int error, char const *name); + +extern void exit_msg(char const *str); diff --git a/integration_tests/libthapi-ctl/hello-cl-startstop.c b/integration_tests/libthapi-ctl/hello-cl-startstop.c new file mode 100644 index 00000000..2ad9ad1a --- /dev/null +++ b/integration_tests/libthapi-ctl/hello-cl-startstop.c @@ -0,0 +1,218 @@ +// Includes +#include +#include +#include +#include "./cl_utils.h" + +#include "thapi-ctl.h" + +#define NLOOP 10 + +int main(int argc, char* argv[]) { + + cl_int err; + + // _ _ _ + // |_) | _. _|_ _|_ _ ._ ._ _ () | \ _ o _ _ + // | | (_| |_ | (_) | | | | (_X |_/ (/_ \/ | (_ (/_ + // + printf(">>> Initializing OpenCL Platform and Device...\n"); + + cl_uint platform_idx = 0; + cl_uint device_idx = 0; + cl_bool run_kernel = CL_FALSE; + + if (argc > 1) + platform_idx = (cl_uint) atoi(argv[1]); + if (argc > 2) + device_idx = (cl_uint) atoi(argv[2]); + if (argc > 3) + run_kernel = CL_TRUE; + + char name[128]; + /* - - - + Plateform + - - - - */ + //A platform is a specific OpenCL implementation, for instance AMD, NVIDIA or Intel. + // Intel may have a different OpenCL implementation for the CPU and GPU. + + // Discover the number of platforms: + cl_uint platform_count; + err = clGetPlatformIDs(0, NULL, &platform_count); + check_error(err, "clGetPlatformIds"); + + // Now ask OpenCL for the platform IDs: + cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * platform_count); + err = clGetPlatformIDs(platform_count, platforms, NULL); + check_error(err, "clGetPlatformIds"); + + cl_platform_id platform = platforms[platform_idx]; + err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 128, name, NULL); + check_error(err, "clGetPlatformInfo"); + + printf("Platform #%d: %s\n", platform_idx, name); + + /* - - - - + Device + - - - - */ + // Device gather data + cl_uint device_count; + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL , 0, NULL, &device_count); + check_error(err, "clGetdeviceIds"); + + cl_device_id* devices = (cl_device_id*)malloc(sizeof(cl_device_id) * device_count); + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL , device_count, devices, NULL); + check_error(err, "clGetdeviceIds"); + + cl_device_id device = devices[device_idx]; + err = clGetDeviceInfo(device, CL_DEVICE_NAME, 128, name, NULL); + check_error(err, "clGetPlatformInfo"); + + printf("-- Device #%d: %s\n", device_idx, name); + + /* - - - - + Context + - - - - */ + // A context is a platform with a set of available devices for that platform. + cl_context context = clCreateContext(0, device_count, devices, NULL, NULL, &err); + check_error(err,"clCreateContext"); + + const char *kernelstring = + "__kernel void hello_world() {" + " const int world_rank = get_global_id(0);" + " printf(\"Hello world from rank %d \\n\", world_rank);" + "}"; + + /* - - - - + Command queue + - - - - */ + // The OpenCL functions that are submitted to a command-queue are enqueued in the order the calls are made but can be configured to execute in-order or out-of-order. + cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + check_error(err,"clCreateCommandQueue"); + + // create a device buffer + size_t a_size = 100; + size_t batch_size = a_size / NLOOP; + size_t offset = 0; + size_t a_bytes = a_size * sizeof(float); + float *h_a = (float*)malloc(a_bytes); + for (unsigned int i = 0; i < a_size; i++) { + h_a[i] = (float)i; + } + cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_WRITE, a_bytes, NULL, &err); + check_error(err,"cclCreateBuffer"); + + // define program and kernel variables, used if run_kernel was requested in + // 3rd arg + cl_program program; + cl_kernel kernel; +#define WORK_DIM 1 + + // Describe the number of global work-items in work_dim dimensions that will execute the kernel function + const size_t global[WORK_DIM] = { 64 }; + + // Describe the number of work-items that make up a work-group (also referred to as the size of the work-group). + // local_work_size can also be a NULL value in which case the OpenCL implementation will determine how to be break the global work-items into appropriate work-group instances. + const size_t local[WORK_DIM] = { 32 }; + + if (run_kernel) { + // Create the program + program = clCreateProgramWithSource(context, 1, &kernelstring, NULL, &err); + check_error(err,"clCreateProgramWithSource"); + + //Build / Compile the program executable + err = clBuildProgram(program, device_count, devices, "-cl-unsafe-math-optimizations", NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error: Failed to build program executable!\n"); + + size_t logSize; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); + + // there's no information in the reference whether the string is 0 terminated or not. + char* messages = (char*)malloc((1+logSize)*sizeof(char)); + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL); + messages[logSize] = '\0'; + + printf("%s", messages); + free(messages); + return EXIT_FAILURE; + } + + /* - - - - + Create + - - - - */ + kernel = clCreateKernel(program, "hello_world", &err); + check_error(err,"clCreateKernel"); + + /* - - - - + ND range + - - - - */ + printf(">>> NDrange configuration...\n"); + + printf("Global work size: %zu \n", global[0]); + printf("Local work size: %zu \n", local[0]); + + } + + /* - - - - + Execute + - - - - */ + for (unsigned int i = 0; i < NLOOP; i++) { + printf(">>> WriteBuffer Execution [%d]...\n", i); + + // trace only odd executions + if (i % 2 == 1) + thapi_ctl_start(); + + // (cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); + err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, offset * sizeof(float), batch_size * sizeof(float), h_a, 0, NULL, NULL); + check_error(err,"clEnqueueWriteBuffer"); + + if (run_kernel) { + printf(">>> Kernel Execution [%d]...\n", i); + err = clEnqueueNDRangeKernel(queue, kernel, WORK_DIM, NULL, global, local, 0, NULL, NULL); + check_error(err,"clEnqueueNDRangeKernel"); + } + + /* - - - + Sync & check + - - - */ + + // Wait for the command queue to get serviced before reading back results + clFinish(queue); + + + /* - - - + Sync & check + - - - */ + + // Wait for the command queue to get serviced before reading back results + clFinish(queue); + + if (i % 2 == 1) + thapi_ctl_stop(); + + offset += batch_size; + } + + // + // / | _ _. ._ o ._ _ + // \_ | (/_ (_| | | | | | (_| + // _| + free(devices); + free(platforms); + clReleaseCommandQueue(queue); + clReleaseMemObject(d_a); + clReleaseContext(context); + + if (run_kernel) { + clReleaseProgram(program); + clReleaseKernel(kernel); + } + + // Exit + return 0; +} + +// ================================================================================================= diff --git a/integration_tests/libthapi-ctl/omp_target/Makefile b/integration_tests/libthapi-ctl/omp_target/Makefile new file mode 100644 index 00000000..d35e134b --- /dev/null +++ b/integration_tests/libthapi-ctl/omp_target/Makefile @@ -0,0 +1,15 @@ +CXXFLAGS = -g -Wall -Wextra $(WERROR) -I$(THAPI_INSTALL_DIR)/include +LDFLAGS = -L$(THAPI_INSTALL_DIR)/lib -lthapi-ctl -L$(LTTNG_INSTALL_DIR)/lib -llttng-ctl + +vec_cuda: vec.cpp + clang++ $(CXXFLAGS) $(LDFLAGS) -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda $^ -o $@ + +vec_intel: vec.cpp + icpx $(CXXFLAGS) $(LDFLAGS) -qopenmp -fopenmp-targets=spir64 $^ -o $@ + +vec_hip: vec.cpp + amdclang++ $(CXXFLAGS) $(LDFLAGS) -fopenmp --offload-arch=gfx1030 $^ -o $@ + +.PHONY: clean +clean: + rm -f vec_cuda vec_intel vec_hip diff --git a/integration_tests/libthapi-ctl/omp_target/vec.cpp b/integration_tests/libthapi-ctl/omp_target/vec.cpp new file mode 100644 index 00000000..d9e6841d --- /dev/null +++ b/integration_tests/libthapi-ctl/omp_target/vec.cpp @@ -0,0 +1,44 @@ +/* Copyright (c) 2019 CSC Training */ +/* Copyright (c) 2021 ENCCS */ +#include +#include + +#include "thapi-ctl.h" + +#define NX 102400 + +#define NITER 10 + +int main(void) +{ + double vecA[NX],vecB[NX],vecC[NX]; + double r=0.2; + +/* Initialization of vectors */ + for (int i = 0; i < NX; i++) { + vecA[i] = pow(r, i); + vecB[i] = 1.0; + } + + for (int n = 0; n < NITER; n++) { + if (n % 2 == 1) + thapi_ctl_start(); + + /* dot product of two vectors */ + #pragma omp target teams distribute parallel for + for (int i = 0; i < NX; i++) { + vecC[i] = vecA[i] * vecB[i]; + } + + if (n % 2 == 1) + thapi_ctl_stop(); + } + + double sum = 0.0; + /* calculate the sum */ + for (int i = 0; i < NX; i++) { + sum += vecC[i]; + } + printf("The sum is: %8.6f \n", sum); + return 0; +} diff --git a/libthapictl/Makefile.am b/libthapictl/Makefile.am new file mode 100644 index 00000000..b7cb93f4 --- /dev/null +++ b/libthapictl/Makefile.am @@ -0,0 +1,29 @@ +if STRICT + WERROR = -Werror +else + WERROR = +endif + +include_HEADERS = \ + include/thapi-ctl.h \ + include/backends.h + +lib_LTLIBRARIES = libthapi-ctl.la + +libthapi_ctl_la_SOURCES = src/startstop.c src/backends.c +libthapi_ctl_la_CFLAGS = -Wall -Wextra $(WERROR) -I$(srcdir)/include -I$(builddir) -I$(top_srcdir)/utils/include $(LTTNG_CTL_CFLAGS) +libthapi_ctl_la_LDFLAGS = $(LTTNG_CTL_LIBS) + +bin_PROGRAMS = thapi-ctl + +thapi_ctl_SOURCES = src/main.c +thapi_ctl_CFLAGS = -Wall -Wextra $(WERROR) -I$(srcdir)/include -I$(builddir) -I$(top_srcdir)/utils/include $(LTTNG_CTL_CFLAGS) +thapi_ctl_LDFLAGS = -L$(builddir) -lthapi-ctl $(LTTNG_CTL_LIBS) +thapi_ctl_DEPENDENCIES = libthapi-ctl.la + +check_PROGRAMS = test-thapi-ctl +test_thapi_ctl_SOURCES = tests/test-thapi-ctl.c +test_thapi_ctl_CFLAGS = -Wall -Wextra $(WERROR) -I$(srcdir)/include $(LTTNG_CTL_CFLAGS) +test_thapi_ctl_LDFLAGS = -L$(builddir) -Wl,-rpath,$(builddir) -lthapi-ctl $(LTTNG_CTL_LIBS) + +TESTS = $(check_PROGRAMS) diff --git a/libthapictl/README.md b/libthapictl/README.md new file mode 100644 index 00000000..967f25ae --- /dev/null +++ b/libthapictl/README.md @@ -0,0 +1,75 @@ +# libthapictl + +This library is desiged to allow applications to control event collection, similar to +how cudaProfilerStart and cudaProfilerStop work. + +## Design Considerations + +The simplest design would be to stop and start the entire lttng session. However many +backends use complex chains of API calls to figure out the device context of things like +memory operations and kernel launches on device. Collection of events for these API +calls must never be disabled or thapi will be unable to correctly identify future +device operations. We call these API calls "bookkeeping" calls. + +In addition to "bookkeeping" events, there are additional categories for backends that +support "minimal", "default", and/or "full" tracing modes, and device profiling can also +be enabled or disabled independently. This creates further categories. + +### Design 1: two sessions + +The initial idea was to create two lttng sessions, one that would collect all the +"bookkeeping" events and one that would collect everything else (the "main" session). Then +thapi_ctl_start/stop could simply start/stop the main session, and always leave the +bookkeeping session running. This worked well enough for CUDA, except when the +"--no-tracing-from-start" option was used. This option causes start to not be run +unless the user calls it, so during cuInit, only the "bookkeeping" session is active. + +The CUDA runtime in particular has special handling for dynamic symbols, using the +cuGetProcAddress_v2 API call as the hook to wrap dynamically loaded symbols. This works +fine if both bookkeeping and main sessions are running when cuGetProcAddress loads all +the symbols (during cuInit, triggered by first CUDA runtime call), and both are configured +to intercept the entry/exit events for that call. However if main is not started, then +it never gets to wrap the dynamic symbols, and cuGetProcAddress is not called again. + +OpenCL does not have this complication, and works well with this method. ZE may be similarly complex to CUDA, and HIP may also be once we add device profiling. + +We could not support "--no-tracing-from-start", and make it the user's responsibility to +call thapi_ctl_stop in the beginning of the program but AFTER cuInit and similar, but this +is fragile and hard to document. Another possiblility would be to try to force cuInit +right after tracer init time, but this is also a very invasive thing for a tracing +framework to do. For this reason, we came up with another design: + +### Design 2: fine grained event enable/disable + +Rather than put "bookkeeping" in a different category, we can have libthapictl know which +events are bookkeeping and which events to enable/disable for start stop. To avoid +duplicating this knowledge in xprof ruby code AND in libthapictl, all this knowledge +can be moved into libthapictl, and a "thapi-ctl" binary that is a thin wrapper around +the library start/stop can be called from xprof. + +There are three commands: + +init: enable bookkeeping events (always called) +start: enable tracing events, according to options requested by user (tracing mode, device profiling on/off, which backends are enabled) +stop: disable tracing events + +By default tracing from start is enabled, so both init and start are called for each enabled +backend. Then the profiled application can link libthapi-ctl and call thapi_ctl_stop/start +as desired. + +This approach is more complex than (1), but it is far more likely to work for all backends +without gotchas for the user like call stop before init breaking everything. + +## Environment variables + +Things like the LTTNG_HOME, session name, and channel name, enabled backends, and tracing +mode are passed from xprof to thapi-ctl and libthapictl via environment variables, which +are memoized with static C functions / static function vars. + +## Testing + +This is a very invasive change to THAPI; in addition to testing the new functionality of +starting/stopping tracing, we need to test that existing functionality like tracing modes +is not broken. To this end new integration tests are being added, that can't be run on +CI because of hardware requirements but should be easy to run manually on machines with +appropriate hardware. See libthapictl and gtensor integration test directories. diff --git a/libthapictl/include/backends.h b/libthapictl/include/backends.h new file mode 100644 index 00000000..2a69e109 --- /dev/null +++ b/libthapictl/include/backends.h @@ -0,0 +1,19 @@ +#pragma once + +#include + +void thapi_cuda_init(struct lttng_handle *h, const char *channel_name); +void thapi_cuda_enable_tracing_events(struct lttng_handle *h, const char *channel_name); +void thapi_cuda_disable_tracing_events(struct lttng_handle *h, const char *channel_name); + +void thapi_opencl_init(struct lttng_handle *h, const char *channel_name); +void thapi_opencl_enable_tracing_events(struct lttng_handle *h, const char *channel_name); +void thapi_opencl_disable_tracing_events(struct lttng_handle *h, const char *channel_name); + +void thapi_omp_init(struct lttng_handle *h, const char *channel_name); +void thapi_omp_enable_tracing_events(struct lttng_handle *h, const char *channel_name); +void thapi_omp_disable_tracing_events(struct lttng_handle *h, const char *channel_name); + +void thapi_hip_init(struct lttng_handle *h, const char *channel_name); +void thapi_hip_enable_tracing_events(struct lttng_handle *h, const char *channel_name); +void thapi_hip_disable_tracing_events(struct lttng_handle *h, const char *channel_name); diff --git a/libthapictl/include/thapi-ctl.h b/libthapictl/include/thapi-ctl.h new file mode 100644 index 00000000..69999cef --- /dev/null +++ b/libthapictl/include/thapi-ctl.h @@ -0,0 +1,73 @@ +#pragma once + +#define THAPI_CTL_LOG_LEVEL_ERROR 1 +#define THAPI_CTL_LOG_LEVEL_INFO 2 +#define THAPI_CTL_LOG_LEVEL_WARN 3 +#define THAPI_CTL_LOG_LEVEL_DEBUG 4 + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Initialize bookkeeping events for enabled backends, if run with iprof + * + * Does nothing if not run with iprof. This is detected using the + * THAPI_LTTNG_SESSION_ID environment variable. + * + * @param enable_tracing: if false, enable only bookkeeping events, otherwise + * enable all events + * + * @return 0 on success + */ +int thapi_ctl_init(); + + +/** + * Start tracing host and device events, when application is run with iprof. + * + * Does nothing if not run with iprof. This is detected using the + * THAPI_LTTNG_SESSION_ID environment variable. + * + * @return 0 on success + */ +int thapi_ctl_start(); + +/** + * Stop tracing host and device events, when application is run with iprof. + * + * Does nothing if not run with iprof. This is detected using the + * THAPI_LTTNG_SESSION_ID environment variable. + * + * @return 0 on success + */ +int thapi_ctl_stop(); + + +/** + * Destroy any lttng handles associated with thapi-ctl. + * + * @return 0 on success + */ +void thapi_ctl_destroy(); + + +/** + * Log to stderr according to configured log level. + * + * Log level is set with THAPI_CTL_LOG_LEVEL env variable. + * + * @param log_level: one of THAPI_CTL_LOG_LEVEL_(DEBUG|INFO|WARN|ERROR) + * @param fmt: printf style format + * @param ...: printf format arguments + */ +void thapi_ctl_log(int log_level, const char *fmt, ...); + + +void thapi_ctl_print_events(); + + +#ifdef __cplusplus +} +#endif diff --git a/libthapictl/src/backends.c b/libthapictl/src/backends.c new file mode 100644 index 00000000..1947fbfe --- /dev/null +++ b/libthapictl/src/backends.c @@ -0,0 +1,396 @@ +#include +#include +#include + +#include + +#include "thapi.h" +#include "thapi-ctl.h" + + +enum tracing_mode_e { + THAPI_CTL_TRACING_MODE_UNSET=0, + THAPI_CTL_TRACING_MODE_MINIMAL=1, + THAPI_CTL_TRACING_MODE_DEFAULT=2, + THAPI_CTL_TRACING_MODE_FULL=3, +}; +typedef enum tracing_mode_e tracing_mode_t; + + +#define ARRAY_LENGTH(a) (sizeof(a) / sizeof(*a)); + + +/* + * "lttng_ust_cuda:cuDriverGetVersion_entry", + "lttng_ust_cuda:cuDriverGetVersion_exit", + "lttng_ust_cuda:cuDriverGetAttribute_entry", + "lttng_ust_cuda:cuDriverGetAttribute_exit", + "lttng_ust_cuda:cuInit_entry", + "lttng_ust_cuda:cuInit_exit", + "lttng_ust_cuda:cuGetExportTable_entry", + "lttng_ust_cuda:cuGetExportTable_exit", + "lttng_ust_cuda:cuCtxGetCurrent_entry", + "lttng_ust_cuda:cuCtxGetCurrent_exit", + */ + +static char* cuda_bookkeeping_events[] = { + "lttng_ust_cuda:cuGetProcAddress_v2_entry", + "lttng_ust_cuda:cuGetProcAddress_v2_exit", + "lttng_ust_cuda:cuCtxCreate_entry", + "lttng_ust_cuda:cuCtxCreate_exit", + "lttng_ust_cuda:cuCtxCreate_v2_entry", + "lttng_ust_cuda:cuCtxCreate_v2_exit", + "lttng_ust_cuda:cuCtxCreate_v3_entry", + "lttng_ust_cuda:cuCtxCreate_v3_exit", + "lttng_ust_cuda:cuCtxDestroy_entry", + "lttng_ust_cuda:cuCtxDestroy_exit", + "lttng_ust_cuda:cuCtxDestroy_v2_entry", + "lttng_ust_cuda:cuCtxDestroy_v2_exit", + "lttng_ust_cuda:cuCtxSetCurrent_entry", + "lttng_ust_cuda:cuCtxSetCurrent_exit", + "lttng_ust_cuda:cuCtxPushCurrent_entry", + "lttng_ust_cuda:cuCtxPushCurrent_exit", + "lttng_ust_cuda:cuCtxPushCurrent_v2_entry", + "lttng_ust_cuda:cuCtxPushCurrent_v2_exit", + "lttng_ust_cuda:cuCtxPopCurrent_entry", + "lttng_ust_cuda:cuCtxPopCurrent_exit", + "lttng_ust_cuda:cuCtxPopCurrent_v2_entry", + "lttng_ust_cuda:cuCtxPopCurrent_v2_exit", + "lttng_ust_cuda:cuDevicePrimaryCtxRetain_entry", + "lttng_ust_cuda:cuDevicePrimaryCtxRetain_exit", + "lttng_ust_cuda:cuStreamCreate_entry", + "lttng_ust_cuda:cuStreamCreate_exit", + "lttng_ust_cuda:cuStreamCreateWithPriority_entry", + "lttng_ust_cuda:cuStreamCreateWithPriority_exit", + "lttng_ust_cuda:cuModuleGetFunction_entry", + "lttng_ust_cuda:cuModuleGetFunction_exit", + "lttng_ust_cuda_profiling:event_profiling_results", +}; + + +// TODO: encapsulate all env vars into a single struct +static tracing_mode_t thapi_ctl_tracing_mode() { + static tracing_mode_t mode = THAPI_CTL_TRACING_MODE_UNSET; + if (mode == THAPI_CTL_TRACING_MODE_UNSET) { + const char *mode_env = getenv("LTTNG_UST_TRACING_MODE"); + if (mode_env == NULL) { + mode = THAPI_CTL_TRACING_MODE_DEFAULT; + } else if (strcmp(mode_env, "minimal") == 0) { + mode = THAPI_CTL_TRACING_MODE_MINIMAL; + } else if (strcmp(mode_env, "full") == 0) { + mode = THAPI_CTL_TRACING_MODE_FULL; + } else if (strcmp(mode_env, "default") == 0) { + mode = THAPI_CTL_TRACING_MODE_DEFAULT; + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_WARN, + "Unknown tracing mode: %s, using default", mode_env); + mode = THAPI_CTL_TRACING_MODE_DEFAULT; + } + } + return mode; +} + + +static int thapi_ctl_profiling() { + static int profiling = -1; + if (profiling == -1) { + const char *profiling_env = getenv("LTTNG_UST_PROFILING"); + if (profiling_env == NULL || strcmp(profiling_env, "1") == 0) { + profiling = 1; + } else { + profiling = 0; + } + } + return profiling; +} + + +static char* cuda_tracing_events[] = { + "lttng_ust_cuda:*", + "lttng_ust_cuda_properties", + "lttng_ust_cuda_profiling:event_profiling", +}; + + +static inline int is_wildcard(const char *event_pattern) { + return (strchr(event_pattern, '*') != NULL); +} + + +static void thapi_enable_events_by_pattern(struct lttng_handle* handle, + struct lttng_event* ev, + int n_patterns, + char** event_patterns, + const char* channel_name, + int exclusion_count, + char **exclusion_list) { + memset(ev, 0, sizeof(*ev)); + ev->type = LTTNG_EVENT_TRACEPOINT; + + for (int i = 0; i < n_patterns; i++) { + const char *pattern = event_patterns[i]; + // thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "Enabling event '%s' on channel '%s'", + // pattern, channel_name); + strncpy(ev->name, pattern, LTTNG_SYMBOL_NAME_LEN); + ev->name[LTTNG_SYMBOL_NAME_LEN - 1] = '\0'; + int rval = 0; + if (exclusion_count > 0 && is_wildcard(pattern)) { + rval = lttng_enable_event_with_exclusions(handle, ev, channel_name, NULL, + exclusion_count, exclusion_list); + } else { + rval = lttng_enable_event_with_exclusions(handle, ev, channel_name, NULL, + 0, NULL); + } + + if (rval < 0 && rval != -LTTNG_ERR_UST_EVENT_ENABLED) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "Failed to enable event '%s': %d", + pattern, rval); + } + } +} + + +static void thapi_disable_events_by_pattern(struct lttng_handle* handle, + struct lttng_event* ev, + int n_patterns, + char** event_patterns, + const char* channel_name) { + memset(ev, 0, sizeof(*ev)); + ev->type = LTTNG_EVENT_ALL; // match any event type + ev->loglevel = -1; // match any log level + + for (int i = 0; i < n_patterns; i++) { + const char *pattern = event_patterns[i]; + // thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "Disabling event '%s' on channel '%s'", + // pattern, channel_name); + strncpy(ev->name, pattern, LTTNG_SYMBOL_NAME_LEN); + ev->name[LTTNG_SYMBOL_NAME_LEN - 1] = '\0'; + int rval = lttng_disable_event_ext(handle, ev, channel_name, NULL); + if (rval < 0) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "Failed to disable event '%s': %d", + pattern, rval); + } + } +} + + +void thapi_cuda_init(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + + int n_bookkeeping_events = ARRAY_LENGTH(cuda_bookkeeping_events); + thapi_enable_events_by_pattern(h, ev, n_bookkeeping_events, cuda_bookkeeping_events, + channel_name, 0, NULL); + + lttng_event_destroy(ev); +} + + +void thapi_cuda_enable_tracing_events(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + + int n_bookkeeping_events = ARRAY_LENGTH(cuda_bookkeeping_events); + int n_tracing_events = ARRAY_LENGTH(cuda_tracing_events); + thapi_enable_events_by_pattern(h, ev, n_tracing_events, cuda_tracing_events, channel_name, + n_bookkeeping_events, cuda_bookkeeping_events); + + lttng_event_destroy(ev); +} + + +void thapi_cuda_disable_tracing_events(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + + int n_tracing_events = sizeof(cuda_tracing_events) + / sizeof(*cuda_tracing_events); + thapi_disable_events_by_pattern(h, ev, n_tracing_events, + cuda_tracing_events, channel_name); + + lttng_event_destroy(ev); +} + + +static char* opencl_bookkeeping_events[] = { + "lttng_ust_opencl:clGetDeviceIDs", + "lttng_ust_opencl:clCreatSubDevices", + "lttng_ust_opencl:clCreateCommandQueue", + "lttng_ust_opencl_arguments:kernel_info", + "lttng_ust_opencl_profiling:event_profiling_results" +}; + +static char* opencl_tracing_default_events[] = { + "lttng_ust_opencl_devices:*", + "lttng_ust_opencl_arguments:*", + "lttng_ust_opencl_build:infos*", +}; + +static char* opencl_tracing_default_exclude_events[] = { + "lttng_ust_opencl:clSetKernelArg*" + "lttng_ust_opencl:clGetKernelArg*", + "lttng_ust_opencl:clSetKernelExecInfo*" + "lttng_ust_opencl:clGetKernelInfo*", + "lttng_ust_opencl:clGetMemAllocInfoINTEL*" +}; + +static char* opencl_profiling_events[] = { + "lttng_ust_opencl_profiling:event_profiling", +}; + + +void thapi_opencl_init(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + + int n_bookkeeping_events = ARRAY_LENGTH(opencl_bookkeeping_events); + thapi_enable_events_by_pattern(h, ev, n_bookkeeping_events, opencl_bookkeeping_events, + channel_name, 0, NULL); + + lttng_event_destroy(ev); +} + + +void thapi_opencl_enable_tracing_events(struct lttng_handle *h, const char *channel_name) { + tracing_mode_t mode = thapi_ctl_tracing_mode(); + + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + + char *host_events_pattern = "lttng_ust_opencl:*"; + if (mode == THAPI_CTL_TRACING_MODE_MINIMAL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_WARN, + "mode minimal no supproted by opencl backend, using default mode"); + mode = THAPI_CTL_TRACING_MODE_DEFAULT; + } + + if (mode == THAPI_CTL_TRACING_MODE_FULL) { + thapi_enable_events_by_pattern(h, ev, 1, &host_events_pattern, channel_name, + 0, NULL); + } else { // MODE_DEFAULT, add same pattern but with exclusion list + int n_exclude_default = ARRAY_LENGTH(opencl_tracing_default_exclude_events); + thapi_enable_events_by_pattern(h, ev, 1, &host_events_pattern, channel_name, + n_exclude_default, + opencl_tracing_default_exclude_events); + } + + int n_tracing_default = ARRAY_LENGTH(opencl_tracing_default_events); + thapi_enable_events_by_pattern(h, ev, n_tracing_default, opencl_tracing_default_events, + channel_name, 0, NULL); + + if (thapi_ctl_profiling()) { + int n_profiling = ARRAY_LENGTH(opencl_profiling_events); + thapi_enable_events_by_pattern(h, ev, n_profiling, opencl_profiling_events, + channel_name, 0, NULL); + } + + lttng_event_destroy(ev); +} + + +void thapi_opencl_disable_tracing_events(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + + int n_tracing_default_events = sizeof(opencl_tracing_default_events) + / sizeof(*opencl_tracing_default_events); + thapi_disable_events_by_pattern(h, ev, n_tracing_default_events, + opencl_tracing_default_events, channel_name); + + char *host_events_pattern = "lttng_ust_opencl:*"; + thapi_disable_events_by_pattern(h, ev, 1, &host_events_pattern, channel_name); + + if (thapi_ctl_profiling()) { + int n_profiling = ARRAY_LENGTH(opencl_profiling_events); + thapi_disable_events_by_pattern(h, ev, n_profiling, opencl_profiling_events, + channel_name); + } + + lttng_event_destroy(ev); +} + + +static char *omp_events_pattern = "lttng_ust_ompt:*target*"; + + +void thapi_omp_init(struct lttng_handle *h, const char *channel_name) { + (void)h; + (void)channel_name; + // no-op, no bookkeeping events for omp + return; +} + + +void thapi_omp_enable_tracing_events(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + thapi_enable_events_by_pattern(h, ev, 1, &omp_events_pattern, channel_name, 0, NULL); + lttng_event_destroy(ev); +} + + +void thapi_omp_disable_tracing_events(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + thapi_disable_events_by_pattern(h, ev, 1, &omp_events_pattern, channel_name); + lttng_event_destroy(ev); +} + + +static char *hip_events_pattern = "lttng_ust_hip:*"; + + +void thapi_hip_init(struct lttng_handle *h, const char *channel_name) { + (void)h; + (void)channel_name; + // no-op, no bookkeeping events for hip + return; +} + + +void thapi_hip_enable_tracing_events(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + thapi_enable_events_by_pattern(h, ev, 1, &hip_events_pattern, channel_name, 0, NULL); + lttng_event_destroy(ev); +} + + +void thapi_hip_disable_tracing_events(struct lttng_handle *h, const char *channel_name) { + struct lttng_event *ev = lttng_event_create(); + if (ev == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "Error creating event"); + return; + } + thapi_disable_events_by_pattern(h, ev, 1, &hip_events_pattern, channel_name); + lttng_event_destroy(ev); +} diff --git a/libthapictl/src/main.c b/libthapictl/src/main.c new file mode 100644 index 00000000..898f3f4a --- /dev/null +++ b/libthapictl/src/main.c @@ -0,0 +1,36 @@ +#include +#include +#include + +#include "thapi-ctl.h" + + +void print_usage(const char* argv0) { + fprintf(stderr, "Usage: %s init|start|stop\n", argv0); +} + + +int main(int argc, char **argv) { + if (argc != 2) { + print_usage(argv[0]); + exit(1); + } + + const char* cmd = argv[1]; + printf("cmd '%s'\n", cmd); + fflush(stdout); + if (strcmp(cmd, "init") == 0) { + thapi_ctl_init(); // enable bookkeeping + } else if (strcmp(cmd, "start") == 0) { + thapi_ctl_start(); + } else if (strcmp(cmd, "stop") == 0) { + thapi_ctl_stop(); + } else if (strcmp(cmd, "list") == 0) { + thapi_ctl_print_events(); + } else { + print_usage(argv[0]); + fprintf(stderr, "Error: unknown command '%s'\n", cmd); + return 1; + } + return 0; +} diff --git a/libthapictl/src/startstop.c b/libthapictl/src/startstop.c new file mode 100644 index 00000000..092419d7 --- /dev/null +++ b/libthapictl/src/startstop.c @@ -0,0 +1,371 @@ +#include +#include +#include +#include +#include + +#include + +#include "thapi.h" +#include "thapi-ctl.h" +#include "backends.h" + + +#define THAPI_LTTNG_SESSION_ID_ENV "LTTNG_UST_SESSION_ID" +#define THAPI_LTTNG_CHANNEL_NAME_ENV "LTTNG_UST_CHANNEL_NAME" + + +#define THAPI_CTL_DEFAULT_LOG_LEVEL THAPI_CTL_LOG_LEVEL_ERROR + + +static const char* thapi_ctl_level_str[5] = { + "", + "ERROR", + "INFO", + "WARN", + "DEBUG" +}; + + +static int thapi_ctl_log_level() { + static int level = -1; + if (level < 0) { + char *log_level_str = getenv("THAPI_CTL_LOG_LEVEL"); + if (log_level_str == NULL || strlen(log_level_str) == 0) { + level = THAPI_CTL_DEFAULT_LOG_LEVEL; + } else { + char *endptr; + level = strtol(log_level_str, &endptr, 10); + if (*endptr != '\0') { + level = THAPI_CTL_DEFAULT_LOG_LEVEL; + } + if (level > THAPI_CTL_LOG_LEVEL_DEBUG || level < THAPI_CTL_LOG_LEVEL_ERROR) { + fprintf(stderr, + "libthapi-ctl[WARN]: unknown log level '%s', falling back to ERROR", + log_level_str); + level = THAPI_CTL_DEFAULT_LOG_LEVEL; + } + } + } + return level; +} + + +static int* thapi_ctl_enabled_backends() { + static int enabled_backends[] = {-1, -1, -1, -1, -1, -1, -1 }; + if (enabled_backends[BACKEND_UNKNOWN] == -1) { + enabled_backends[BACKEND_UNKNOWN] = 0; + enabled_backends[BACKEND_ZE] = (getenv("LTTNG_UST_BACKEND_ZE_ENABLED") != NULL); + enabled_backends[BACKEND_OPENCL] = + (getenv("LTTNG_UST_BACKEND_OPENCL_ENABLED") != NULL); + enabled_backends[BACKEND_CUDA] = (getenv("LTTNG_UST_BACKEND_CUDA_ENABLED") != NULL); + enabled_backends[BACKEND_OMP_TARGET_OPERATIONS] = + (getenv("LTTNG_UST_BACKEND_OMP_TARGET_OPERATIONS_ENABLED") != NULL); + enabled_backends[BACKEND_OMP] = (getenv("LTTNG_UST_BACKEND_OMP_ENABLED") != NULL); + enabled_backends[BACKEND_HIP] = (getenv("LTTNG_UST_BACKEND_HIP_ENABLED") != NULL); + } + return enabled_backends; +} + + +static void log_events(struct lttng_handle* handle, const char *channel_name, + const char *prefix) { + int ret; + struct lttng_event *event_list = NULL; + struct lttng_event *event = NULL; + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "channel name %s", channel_name); + int n_events = lttng_list_events(handle, channel_name, &event_list); + if (n_events < 0) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "[%s] error getting events: %d", + prefix, n_events); + return; + } + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s] %d events", prefix, n_events); + const char *filter_str; + for (int i = 0; i < n_events; i++) { + event = &event_list[i]; + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s] %s (enabled: %d)", + prefix, event->name, event->enabled); + ret = lttng_event_get_filter_expression(event, &filter_str); + if (ret) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "can't get filter expr %d", ret); + } else if (filter_str) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s] + filter str '%s'", + prefix, filter_str); + } + + int n_exclusions = lttng_event_get_exclusion_name_count(event); + if (n_exclusions < 0) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "can't get exclusion name count %d", n_exclusions); + } else if (n_exclusions > 0) { + for (int j = 0; j < n_exclusions; j++) { + const char *exclusion_name; + ret = lttng_event_get_exclusion_name(event, j, &exclusion_name); + if (ret < 0) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "can't get exclusion name[%d] %d", j, ret); + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "[%s] + exclude %02d '%s'", + prefix, j, exclusion_name); + } + } + } + } + if (event_list != NULL) + free(event_list); +} + + +void thapi_ctl_log(int log_level, const char *fmt, ...) { + assert(log_level >= THAPI_CTL_LOG_LEVEL_ERROR && log_level <= THAPI_CTL_LOG_LEVEL_DEBUG); + int config_level = thapi_ctl_log_level(); + if (log_level <= config_level) { + fprintf(stderr, "libthapi-ctl[%s]: ", thapi_ctl_level_str[log_level]); + va_list va; + va_start(va, fmt); + vfprintf(stderr, fmt, va); + va_end(va); + fprintf(stderr, "\n"); + } + if (log_level >= THAPI_CTL_LOG_LEVEL_DEBUG) { + fflush(stderr); + } +} + + +static const char* thapi_ctl_lttng_session_id() { + static char *session_id = NULL; + if (session_id == NULL) { + // first call, initialize static vars + session_id = getenv(THAPI_LTTNG_SESSION_ID_ENV); + if (session_id == NULL) { + // not called with iprof + session_id = "\0"; + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, + "No env var '%s', application not launched using iprof, disable", + THAPI_LTTNG_SESSION_ID_ENV); + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, + "THAPI session id: %s", session_id); + } + } + return session_id; +} + + +static const char* thapi_ctl_lttng_channel_name() { + static char *channel_name = NULL; + if (channel_name == NULL) { + // first call, initialize static vars + channel_name = getenv(THAPI_LTTNG_CHANNEL_NAME_ENV); + if (channel_name == NULL) { + // not called with iprof + channel_name = "\0"; + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, + "No env var '%s', application not launched using iprof, disable", + THAPI_LTTNG_CHANNEL_NAME_ENV); + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, + "THAPI channel: %s", channel_name); + } + } + return channel_name; +} + + +static struct lttng_handle* thapi_ctl_create_lttng_handle(const char *session_id) { + struct lttng_handle* handle = NULL; + if (strlen(session_id) > 0) { + struct lttng_domain dom; + memset(&dom, 0, sizeof(dom)); + dom.type = LTTNG_DOMAIN_UST; + dom.buf_type = LTTNG_BUFFER_PER_UID; + handle = lttng_create_handle(session_id, &dom); + if (handle == NULL) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "Failed to create lttng handle for session '%s'", + session_id); + + } + } + return handle; +} + + +void thapi_ctl_destroy_lttng_handle(struct lttng_handle *handle) { + if (handle != NULL) { + lttng_destroy_handle(handle); + } +} + + +int thapi_ctl_init() { + const char *session_id = thapi_ctl_lttng_session_id(); + const char *channel_name = thapi_ctl_lttng_channel_name(); + struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id); + if (handle == NULL) + return 0; + + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "init backends '%s' (channel '%s'", + session_id, channel_name); + log_events(handle, channel_name, "before"); + int *backends_enabled = thapi_ctl_enabled_backends(); + if (backends_enabled[BACKEND_CUDA]) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "cuda enabled"); + thapi_cuda_init(handle, channel_name); + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "cuda DISABLED"); + } + if (backends_enabled[BACKEND_OPENCL]) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "opencl enabled"); + thapi_opencl_init(handle, channel_name); + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "opencl DISABLED"); + } + if (backends_enabled[BACKEND_OMP]) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "omp enabled"); + thapi_omp_init(handle, channel_name); + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "omp DISABLED"); + } + if (backends_enabled[BACKEND_HIP]) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "hip enabled"); + thapi_hip_init(handle, channel_name); + } else { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_INFO, "hip DISABLED"); + } + log_events(handle, channel_name, "after"); + thapi_ctl_destroy_lttng_handle(handle); + return 0; +} + + +int thapi_ctl_start() { + const char *session_id = thapi_ctl_lttng_session_id(); + const char *channel_name = thapi_ctl_lttng_channel_name(); + struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id); + if (handle == NULL) + return 0; + + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "start tracing '%s' (channel '%s')", + session_id, channel_name); + log_events(handle, channel_name, "before"); + int *backends_enabled = thapi_ctl_enabled_backends(); + if (backends_enabled[BACKEND_CUDA]) { + thapi_cuda_enable_tracing_events(handle, channel_name); + } + if (backends_enabled[BACKEND_OPENCL]) { + thapi_opencl_enable_tracing_events(handle, channel_name); + } + if (backends_enabled[BACKEND_OMP]) { + thapi_omp_enable_tracing_events(handle, channel_name); + } + if (backends_enabled[BACKEND_HIP]) { + thapi_hip_enable_tracing_events(handle, channel_name); + } + log_events(handle, channel_name, "after"); + thapi_ctl_destroy_lttng_handle(handle); + return 0; +} + + +int thapi_ctl_stop() { + const char *session_id = thapi_ctl_lttng_session_id(); + const char *channel_name = thapi_ctl_lttng_channel_name(); + struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id); + if (handle == NULL) + return 0; + + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_DEBUG, "stop tracing '%s' (channel '%s')", + session_id, channel_name); + log_events(handle, channel_name, "before"); + + int *backends_enabled = thapi_ctl_enabled_backends(); + if (backends_enabled[BACKEND_CUDA]) { + thapi_cuda_disable_tracing_events(handle, channel_name); + } + if (backends_enabled[BACKEND_OPENCL]) { + thapi_opencl_disable_tracing_events(handle, channel_name); + } + if (backends_enabled[BACKEND_OMP]) { + thapi_omp_disable_tracing_events(handle, channel_name); + } + if (backends_enabled[BACKEND_HIP]) { + thapi_hip_disable_tracing_events(handle, channel_name); + } + log_events(handle, channel_name, "after"); + thapi_ctl_destroy_lttng_handle(handle); + return 0; +} + + +void thapi_ctl_print_events() { + const char *session_id = thapi_ctl_lttng_session_id(); + const char *channel_name = thapi_ctl_lttng_channel_name(); + struct lttng_handle *handle = thapi_ctl_create_lttng_handle(session_id); + if (handle == NULL) + return; + + int ret; + struct lttng_event *event_list = NULL; + int n_events = lttng_list_events(handle, channel_name, &event_list); + if (n_events < 0) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, "error getting events: %d", + n_events); + return; + } + printf("%d events\n", n_events); + const char *filter_str; + for (int i = 0; i < n_events; i++) { + struct lttng_event *event = &event_list[i]; + printf("%s (enabled: %d)", event->name, event->enabled); + ret = lttng_event_get_filter_expression(event, &filter_str); + if (ret != 0) { + fprintf(stderr, "Error getting filter expression: %d", ret); + } else if (filter_str) { + printf(" (filter '%s')", filter_str); + } + + int n_exclusions = lttng_event_get_exclusion_name_count(event); + if (n_exclusions < 0) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "can't get exclusion name count %d", n_exclusions); + } else if (n_exclusions > 0) { + // make room for max event name lengths plus commas and null terminator + size_t exclusions_string_max_length = n_exclusions * (LTTNG_SYMBOL_NAME_LEN + 1); + char *exclusions_string = malloc(exclusions_string_max_length); + assert(exclusions_string != NULL); + char *exclusions_offset = exclusions_string; + for (int j = 0; j < n_exclusions; j++) { + const char *exclusion_name; + ret = lttng_event_get_exclusion_name(event, j, &exclusion_name); + if (ret < 0) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "can't get exclusion name[%d] %d", j, ret); + break; + } else { + int nchar = snprintf(exclusions_offset, LTTNG_SYMBOL_NAME_LEN, + "%s", exclusion_name); + if (nchar < 0 || nchar >= LTTNG_SYMBOL_NAME_LEN) { + thapi_ctl_log(THAPI_CTL_LOG_LEVEL_ERROR, + "Error formatting event exclusions: ", j, nchar); + break; + } else { + exclusions_offset += nchar; + } + if (j < n_exclusions - 1) { + sprintf(exclusions_offset, ","); + exclusions_offset++; + } + } + assert(exclusions_offset < (exclusions_string + exclusions_string_max_length)); + *exclusions_offset = '\0'; + } + printf(" (exclude '%s')", exclusions_string); + } + printf("\n"); + } + if (event_list != NULL) + free(event_list); +} diff --git a/libthapictl/tests/Makefile b/libthapictl/tests/Makefile new file mode 100644 index 00000000..10276e3b --- /dev/null +++ b/libthapictl/tests/Makefile @@ -0,0 +1,19 @@ + +.PHONY: all +all: hello hello-ze + +hello-tp.o: hello-tp.c hello-tp.h + gcc -c -I. hello-tp.c + +hello.o: hello.c + gcc -c -I../ hello.c + +hello: hello.o hello-tp.o + gcc -o hello hello.o hello-tp.o -llttng-ust -ldl -L../ -lthapi-ctl -llttng-ctl + +hello-ze: hello-ze.cxx + icpx -fsycl -o $@ $< -I../ -llttng-ctl -Wl,-rpath,$(THAPI_HOME)/lib -L$(THAPI_HOME)/lib -lthapi-ctl + +.PHONY: clean +clean: + rm -rf hello *.o diff --git a/libthapictl/tests/test-thapi-ctl.c b/libthapictl/tests/test-thapi-ctl.c new file mode 100644 index 00000000..2a23d098 --- /dev/null +++ b/libthapictl/tests/test-thapi-ctl.c @@ -0,0 +1,19 @@ +#include + +#include "thapi-ctl.h" + + +int main(int argc, char **argv) { + (void) argc; + (void) argv; + // TODO: is there something we can do here that requires no "backed"? + // maybe can do omp CPU? + // for now just makes sure can compile and run without blowing up + for (int i = 0; i < 10; i++) { + if (i % 2 == 1) + thapi_ctl_start(); + printf("loop %d\n", i); + if (i % 2 == 1) + thapi_ctl_stop(); + } +} diff --git a/utils/Makefile.am b/utils/Makefile.am index b5df32e9..266a5e47 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -4,6 +4,9 @@ else WERROR = endif +include_HEADERS = \ + include/thapi.h + noinst_HEADERS = \ include/utarray.h \ include/uthash.h \ diff --git a/utils/include/thapi.h b/utils/include/thapi.h new file mode 100644 index 00000000..08830eca --- /dev/null +++ b/utils/include/thapi.h @@ -0,0 +1,13 @@ +#pragma once + +enum backend_e { + BACKEND_UNKNOWN = 0, + BACKEND_ZE = 1, + BACKEND_OPENCL = 2, + BACKEND_CUDA = 3, + BACKEND_OMP_TARGET_OPERATIONS = 4, + BACKEND_OMP = 5, + BACKEND_HIP = 6, +}; +typedef enum backend_e backend_t; +typedef unsigned backend_level_t; diff --git a/utils/xprof_utils.hpp b/utils/xprof_utils.hpp index d5f88d90..cefba4c8 100644 --- a/utils/xprof_utils.hpp +++ b/utils/xprof_utils.hpp @@ -12,17 +12,7 @@ #include #include -enum backend_e { - BACKEND_UNKNOWN = 0, - BACKEND_ZE = 1, - BACKEND_OPENCL = 2, - BACKEND_CUDA = 3, - BACKEND_OMP_TARGET_OPERATIONS = 4, - BACKEND_OMP = 5, - BACKEND_HIP = 6, -}; -typedef enum backend_e backend_t; -typedef unsigned backend_level_t; +#include "thapi.h" // Should be ordered by the value of the backend_e const std::unordered_map pretty_backend_name_g = { diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 9047363d..e3f2547a 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -49,9 +49,40 @@ def exec(cmd, opts: {}, debug: true) LOGGER.warn { stderr_str.strip } unless stderr_str.empty? LOGGER.debug { stdout_str.strip } unless stdout_str.empty? + stdout_str end +def lttng_env() + { 'LTTNG_HOME' => lttng_home_dir } +end + +# TODO: should we just apply this to ENV once and use everywhere? +def thapi_env(backends, tracing_mode, trace_from_start, profiling) + rval = lttng_env.dup + rval.merge!({ + 'LTTNG_UST_SESSION_ID' => lttng_session_uuid, + 'LTTNG_UST_CHANNEL_NAME' => lttng_blocking_channel_name, + 'LTTNG_UST_TRACING_MODE' => tracing_mode, + 'LTTNG_UST_TRACE_FROM_START' => trace_from_start.to_s, + # TODO: combine with LTTNG_UST_{be}_PROFILE??? + 'LTTNG_UST_PROFILING' => profiling ? "1" : "0" + }) + backends.each do |name| + rval["LTTNG_UST_BACKEND_#{name.upcase}_ENABLED"] = '1' + end + rval +end + +def exec_thapi_ctl(command, backends, tracing_mode, trace_from_start, profiling) + ENV.merge!(thapi_env(backends, tracing_mode, trace_from_start, profiling)) + # exec("lttng list --channel #{channel_name} --userspace #{lttng_session_uuid}") + # exec("#{BINDIR}/thapi-ctl list") + exec("#{BINDIR}/thapi-ctl #{command}") + # exec("#{BINDIR}/thapi-ctl list") + # exec("lttng list --channel #{channel_name} --userspace #{lttng_session_uuid}") +end + # _ # |_ |\ | \ / # |_ | \| \/ @@ -322,6 +353,10 @@ def lttng_session_uuid Digest::MD5.hexdigest(lttng_trace_dir_tmp) end +def lttng_blocking_channel_name() + 'blocking-channel' +end + def sampling? return false unless OPTIONS[:sample] @@ -429,7 +464,8 @@ def launch_usr_bin(env, cmd) end end -def enable_events_ze(channel_name, tracing_mode: 'default', profiling: true) +def enable_events_ze(channel_name, tracing_mode: 'default', profiling: true, + trace_from_start: true) lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" case tracing_mode when 'minimal' @@ -464,7 +500,23 @@ def enable_events_ze(channel_name, tracing_mode: 'default', profiling: true) end end -def enable_events_cl(channel_name, tracing_mode: 'default', profiling: true) +def enable_events_cl(channel_name, tracing_mode: 'default', profiling: true, + trace_from_start: true) + exec_thapi_ctl("init", ["opencl"], tracing_mode, trace_from_start, profiling) + if trace_from_start + exec_thapi_ctl("start", ["opencl"], tracing_mode, trace_from_start, profiling) + end + return + + # lttng_enable_bookkeeping = "lttng enable-event --userspace --session=#{lttng_session_uuid_bookkeeping} --channel=#{channel_name}" + # bookkeeping_events = [ + # "lttng_ust_opencl:clGetDeviceIDs", + # "lttng_ust_opencl:clCreatSubDevices", + # "lttng_ust_opencl:clCreateCommandQueue", + # "lttng_ust_opencl_arguments:kernel_info", + # ] + # exec("#{lttng_enable_bookkeeping} #{bookkeeping_events.join(',')}") + # lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" case tracing_mode when 'full' @@ -494,24 +546,92 @@ def enable_events_cl(channel_name, tracing_mode: 'default', profiling: true) end end -def enable_events_cuda(channel_name, tracing_mode: 'default', profiling: true) +def enable_events_cuda(channel_name, tracing_mode: 'default', profiling: true, + trace_from_start: true) + exec_thapi_ctl("init", ["cuda"], tracing_mode, trace_from_start, profiling) + if trace_from_start + exec_thapi_ctl("start", ["cuda"], tracing_mode, trace_from_start, profiling) + end + # exec("lttng list --channel #{channel_name} --userspace #{lttng_session_uuid}") + return + + + # lttng_enable_bookkeeping = "lttng enable-event --userspace --session=#{lttng_session_uuid_bookkeeping} --channel=#{channel_name}" + # required_events = [ + # "lttng_ust_cuda:cuGetProcAddress_v2_entry", + # "lttng_ust_cuda:cuGetProcAddress_v2_exit", + # ].join(',') + # #"lttng_ust_cuda:cuGetExportTable_entry", + # #"lttng_ust_cuda:cuGetExportTable_exit" + # #"lttng_ust_cuda:cuGetProcAddress_v2_entry", + # #"lttng_ust_cuda:cuGetProcAddress_v2_exit", + # #"lttng_ust_cuda:cuDeviceGetCount_entry", + # #"lttng_ust_cuda:cuDeviceGetCount_exit", + # #"lttng_ust_cuda:cuDeviceGet_entry", + # #"lttng_ust_cuda:cuDeviceGet_exit", + # #"lttng_ust_cuda:cuDeviceGetName_entry", + # #"lttng_ust_cuda:cuDeviceGetName_exit", + # bookkeeping_events = [ + # "lttng_ust_cuda:cuCtxCreate_entry", + # "lttng_ust_cuda:cuCtxCreate_exit", + # "lttng_ust_cuda:cuCtxCreate_v2_entry", + # "lttng_ust_cuda:cuCtxCreate_v2_exit", + # "lttng_ust_cuda:cuCtxCreate_v3_entry", + # "lttng_ust_cuda:cuCtxCreate_v3_exit", + # "lttng_ust_cuda:cuCtxDestroy_entry", + # "lttng_ust_cuda:cuCtxDestroy_exit", + # "lttng_ust_cuda:cuCtxDestroy_v2_entry", + # "lttng_ust_cuda:cuCtxDestroy_v2_exit", + # "lttng_ust_cuda:cuCtxSetCurrent_entry", + # "lttng_ust_cuda:cuCtxSetCurrent_exit", + # "lttng_ust_cuda:cuCtxPushCurrent_entry", + # "lttng_ust_cuda:cuCtxPushCurrent_exit", + # "lttng_ust_cuda:cuCtxPushCurrent_v2_entry", + # "lttng_ust_cuda:cuCtxPushCurrent_v2_exit", + # "lttng_ust_cuda:cuCtxPopCurrent_entry", + # "lttng_ust_cuda:cuCtxPopCurrent_exit", + # "lttng_ust_cuda:cuCtxPopCurrent_v2_entry", + # "lttng_ust_cuda:cuCtxPopCurrent_v2_exit", + # "lttng_ust_cuda:cuDevicePrimaryCtxRetain_entry", + # "lttng_ust_cuda:cuDevicePrimaryCtxRetain_exit", + # "lttng_ust_cuda:cuStreamCreate_entry", + # "lttng_ust_cuda:cuStreamCreate_exit", + # "lttng_ust_cuda:cuStreamCreateWithPriority_entry", + # "lttng_ust_cuda:cuStreamCreateWithPriority_exit", + # "lttng_ust_cuda:cuModuleGetFunction_entry", + # "lttng_ust_cuda:cuModuleGetFunction_exit" + # ].join(',') + # exec("#{lttng_enable_bookkeeping} #{bookkeeping_events},#{required_events}") + # lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" exec("#{lttng_enable} lttng_ust_cuda:*") + exec("#{lttng_enable} #{required_events}") exec("#{lttng_enable} lttng_ust_cuda_properties") exec("#{lttng_enable} lttng_ust_cuda_profiling:*") if profiling end -def enable_events_hip(channel_name, tracing_mode: 'default', profiling: true) - lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" - exec("#{lttng_enable} lttng_ust_hip:*") +def enable_events_hip(channel_name, tracing_mode: 'default', profiling: true, + trace_from_start: true) + # lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" + # exec("#{lttng_enable} lttng_ust_hip:*") + exec_thapi_ctl("init", ["hip"], tracing_mode, trace_from_start, profiling) + if trace_from_start + exec_thapi_ctl("start", ["hip"], tracing_mode, trace_from_start, profiling) + end end -def enable_events_omp(channel_name, tracing_mode: 'default', profiling: true) - lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" - exec("#{lttng_enable} lttng_ust_ompt:*target*") +def enable_events_omp(channel_name, tracing_mode: 'default', profiling: true, + trace_from_start: true) + # lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" + # exec("#{lttng_enable} lttng_ust_ompt:*target*") + exec_thapi_ctl("init", ["omp"], tracing_mode, trace_from_start, profiling) + if trace_from_start + exec_thapi_ctl("start", ["omp"], tracing_mode, trace_from_start, profiling) + end end -def enable_events_metadata(channel_name, tracing_mode: 'default', profiling: true) +def enable_events_metadata(channel_name, tracing_mode: 'default', profiling: true, + trace_from_start: true) lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" exec("#{lttng_enable} lttng_ust_thapi:*") end @@ -528,7 +648,8 @@ def setup_lttng(backends) File.write(File.join(lttng_trace_dir_tmp, 'thapi_metadata.yaml'), { type: 'lttng' }.to_yaml) - channel_name = 'blocking-channel' + channel_name = lttng_blocking_channel_name + exec("lttng enable-channel --userspace --session=#{lttng_session_uuid} --blocking-timeout=inf #{channel_name}") exec("lttng add-context --userspace --session=#{lttng_session_uuid} --channel=#{channel_name} -t vpid -t vtid") @@ -536,7 +657,8 @@ def setup_lttng(backends) (backends + ['metadata']).each do |name| send("enable_events_#{name}", channel_name, tracing_mode: OPTIONS[:'tracing-mode'], - profiling: OPTIONS[:profile]) + profiling: OPTIONS[:profile], + trace_from_start: OPTIONS[:'trace-from-start']) end if OPTIONS.include?(:sample) @@ -626,15 +748,17 @@ def on_node_processing_sync_and_rename(syncd) end # Start, Stop lttng, amd do the on-node analsysis -def trace_and_on_node_processing(usr_argv) +def trace_and_on_node_processing(usr_argv, options) # All masters set a future global barrier Sync_daemon.open do |syncd| # Load Tracers and APILoaders Lib backends, h = env_tracers - # All ranks need to set the LLTTNG_HOME env - # so they can have access to the daemon - ENV['LTTNG_HOME'] = lttng_home_dir + # TODO: set in one place, or pass per command? + ENV.merge!(thapi_env(backends, options[:'tracing-mode'], + options[:'trace-from-start'], + options[:'profile'])) + # Only local master spawn LTTNG daemon and start session setup_lttng(backends) if mpi_local_master? syncd.local_barrier('waiting_for_lttng_setup') @@ -744,6 +868,8 @@ if __FILE__ == $PROGRAM_NAME end parser.on('--[no-]profile', 'Trigger for device activities profiling', default: true) parser.on('--[no-]analysis', 'Trigger for analysis', default: true) + parser.on('--[no-]trace-from-start', 'Disable tracing at start (default enabled)', + default: true) # General Options parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.", @@ -819,7 +945,7 @@ if __FILE__ == $PROGRAM_NAME # Right now, `replay` means no tracing. # But we don't have a way of disabling post-processing - folder = OPTIONS.include?(:replay) ? OPTIONS[:replay] || last_trace_saved : trace_and_on_node_processing(ARGV) + folder = OPTIONS.include?(:replay) ? OPTIONS[:replay] || last_trace_saved : trace_and_on_node_processing(ARGV, options) if mpi_master? warn("THAPI: Trace location: #{folder}") global_processing(folder) if OPTIONS[:analysis]