Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add libthapi-ctl to start/stop collecting #233

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
cb3a044
add libthapi-ctl
bd4 Jun 7, 2024
94d4659
xprof: separate bookkeeping events for cuda
bd4 Jun 7, 2024
45f92f2
WIP gtensor int test
bd4 Jun 9, 2024
21cffca
libthapi-ctl: logging with THAPI_CTL_LOG_LEVEL
bd4 Jun 10, 2024
c387ae0
cuda: fix bookkeeping events
bd4 Jun 13, 2024
beb1e54
doxygen comments
bd4 Jun 13, 2024
32b12dc
WIP cuda event based enable/disable
bd4 Jun 14, 2024
20a70f5
clean up WIP cuda event based start/stop
bd4 Jun 20, 2024
a7adec5
log message clean up
bd4 Jun 20, 2024
4dbbe2a
fix
bd4 Jun 20, 2024
b3ddb0c
fix
bd4 Jun 24, 2024
32bddbe
cuda exclusions, cleanup
bd4 Jun 24, 2024
41ef9ff
fix exclude print
bd4 Jun 24, 2024
f567f4c
rename env vars, fix cuda, TRACE_FROM_START env var
bd4 Jun 25, 2024
bf35bc5
working opencl start/stop
bd4 Jun 25, 2024
fefd4b8
make test names match new arg name
bd4 Jun 25, 2024
9b3fd78
WIP better thapi-ctl int tests
bd4 Jun 26, 2024
aafc959
fix cl int test
bd4 Jun 27, 2024
23f280f
missing header
bd4 Jun 27, 2024
803e64f
add opencl fn to header
bd4 Jun 27, 2024
2fd0a71
libthapictl: use autotools dep for lttng-ctl
bd4 Jun 27, 2024
aea464b
WIP omp start/stop, int tests
bd4 Jun 27, 2024
fb6e46f
libthapictl: add design readme
bd4 Jun 27, 2024
72b57fb
ci: use lttng ppa to get latest stable
bd4 Jun 27, 2024
743757c
fix libthapictl header missing in dist
bd4 Jun 28, 2024
33ae513
xprof: make thapi-ctl work w/o PATH mod
bd4 Jun 28, 2024
4ea5771
ci: fix thapictl empty sync daemon
bd4 Jun 28, 2024
018682e
working gtensor int test for cuda
bd4 Jun 28, 2024
edf028b
use thapi-ctl for hip
bd4 Jun 28, 2024
3abe9a4
improve gtensor int tests, add readme
bd4 Jun 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions .github/workflows/presubmit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
if: steps.babeltrace2.outputs.cache-hit != 'true'
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
if: steps.babeltrace2.outputs.cache-hit != 'true'
Expand Down Expand Up @@ -69,7 +70,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -112,7 +114,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -146,9 +149,12 @@ jobs:
include:
- bats_file: 'iprof'
thapi_sync_daemon: 'fs' # Need to force, if not `THAPI_SYNC_DAEMON` will be set to empty list
- bats_file: 'libthapi-ctl'
thapi_sync_daemon: 'fs'
env:
THAPI_TEST_BIN: clinfo
THAPI_BIN_DIR: ./build/ici/bin
THAPI_BIN_DIR: ${{ github.workspace }}/build/ici/bin
THAPI_INSTALL_DIR: ${{ github.workspace }}/build/ici
THAPI_SYNC_DAEMON: ${{ matrix.thapi_sync_daemon }}
steps:
- uses: actions/checkout@v4
Expand All @@ -165,12 +171,16 @@ jobs:
name: thapi-bin
- name: Untar THAPI
run: tar -xvf thapi.tar
- run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils liblttng-ctl-dev ocl-icd-opencl-dev jq
- run: sudo gem install babeltrace2 opencl_ruby_ffi
- name: Load Babeltrace2
run: |
echo "$HOME/babeltrace2/2.0.4/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.4/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- name: ldconfig
run: |
ldconfig -p | grep libOpenCL
- name: integration test ${{ matrix.bats_file }}
run: |
bats integration_tests/${{ matrix.bats_file }}.bats
Expand All @@ -189,7 +199,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -227,7 +238,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down Expand Up @@ -259,7 +271,8 @@ jobs:
with:
path: ~/babeltrace2/2.0.4
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo apt-add-repository ppa:lttng/stable-2.13
- run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev liblttng-ctl-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
run: |
Expand Down
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ACLOCAL_AMFLAGS = -I m4
SUBDIRS = utils xprof sampling opencl ze cuda omp hip
SUBDIRS = utils xprof sampling opencl ze cuda omp hip libthapictl

EXTRA_DIST = autogen.sh README.md .valgrind/dlopen.supp
2 changes: 2 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ AM_CONDITIONAL([FOUND_MPI], [test "x$found_mpi" = xyes])
PKG_CHECK_MODULES([LIBFFI], [libffi >= 3.2])
PKG_CHECK_MODULES([BABELTRACE2], [babeltrace2 >= 2.0])
PKG_CHECK_MODULES([LTTNG_UST], [lttng-ust >= 2.10])
PKG_CHECK_MODULES([LTTNG_CTL], [lttng-ctl >= 2.13.13])
PKG_CHECK_MODULES([PROTOBUF], [protobuf >= 3.0])

AX_RUBY_EXTENSION([cast-to-yaml], [yes])
Expand Down Expand Up @@ -114,6 +115,7 @@ AC_CONFIG_FILES([
hip/Makefile
xprof/Makefile
utils/Makefile
libthapictl/Makefile
])
AC_CONFIG_FILES([utils/test_wrapper_thapi_text_pretty.sh], [chmod +x utils/test_wrapper_thapi_text_pretty.sh])
AC_CONFIG_FILES([opencl/tracer_opencl.sh], [chmod +x opencl/tracer_opencl.sh])
Expand Down
2 changes: 1 addition & 1 deletion cuda/cuda_model.rb
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def upper_snake_case(str)
fullname = "#{name}#{versions.size - i > 1 ? "_v#{versions.size - i}" : ""}#{suffix}"
fullname = "#{name}_v2#{suffix}" unless command_names.include?(fullname)
sstr = <<EOF
if (tracepoint_enabled(lttng_ust_cuda, #{fullname}_#{START}) #{suffixes.size > 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) {
if ((!_trace_from_start() || tracepoint_enabled(lttng_ust_cuda, #{fullname}_#{START})) #{suffixes.size > 1 ? "&& #{suffix ? "pt_condition" : "normal_condition" } " : ""}&& cudaVersion >= #{version} && strcmp(symbol, "#{name}") == 0) {
wrap_#{fullname}(pfn);
}
EOF
Expand Down
9 changes: 9 additions & 0 deletions cuda/tracer_cuda_helpers.include.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,15 @@ static inline void _primary_context_reset(CUdevice dev) {
pthread_mutex_unlock(&_primary_contexts_mutex);
}

/* Returns 1 if tracing should be active from program start, 0 otherwise.
 *
 * Controlled by the LTTNG_UST_TRACE_FROM_START environment variable:
 * trace-from-start is the default (variable unset, or set exactly to "1");
 * any other value disables it. The answer is computed on first call and
 * cached for the lifetime of the process, so later changes to the
 * environment have no effect.
 */
static int _trace_from_start(void) {
  /* -1 marks "not yet computed"; getenv/strcmp run only once. */
  static int trace_from_start = -1;
  if (trace_from_start == -1) {
    const char *env_value = getenv("LTTNG_UST_TRACE_FROM_START");
    trace_from_start = (env_value == NULL || strcmp(env_value, "1") == 0);
  }
  return trace_from_start;
}

static void THAPI_ATTRIBUTE_DESTRUCTOR
_lib_cleanup() {
if (_do_cleanup) {
Expand Down
37 changes: 37 additions & 0 deletions integration_tests/gtensor.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env bats

setup_file() {
  # iprof from the THAPI installation under test (THAPI_INSTALL_DIR is
  # exported by the CI workflow).
  export IPROF=$THAPI_INSTALL_DIR/bin/iprof

  # One out-of-source build directory per device backend (cuda/hip/sycl).
  bdir="build-$GTENSOR_DEVICE"

  # Configure and build the test binary; run in a subshell so the working
  # directory of the surrounding test process is unchanged.
  (cd integration_tests/gtensor;
  cmake -S . -B $bdir -DGTENSOR_DEVICE=$GTENSOR_DEVICE \
  -DTHAPI_PATH=$THAPI_INSTALL_DIR \
  && cmake --build $bdir)

  # Absolute path to the freshly built binary, used by the @test cases.
  export THAPI_TEST_BIN=$(pwd)/integration_tests/gtensor/$bdir/axpy_start_stop
  # Make libthapi-ctl.so resolvable at run time.
  export LD_LIBRARY_PATH=$THAPI_INSTALL_DIR/lib:$LD_LIBRARY_PATH
}

teardown_file() {
  # Remove traces produced by the test runs. Guard against an unset or
  # empty THAPI_HOME: the original unquoted expansion would have run
  # `rm -rf /thapi-traces` at the filesystem root in that case.
  if [ -n "${THAPI_HOME:-}" ]; then
    rm -rf "${THAPI_HOME}/thapi-traces"
  fi
}

@test "axpy-${GTENSOR_DEVICE}-trace-from-start" {
  # Default behavior: tracing is active from program start, so the 0th
  # kernel launch is captured in addition to the 5 launches made while
  # thapi_ctl_start/stop explicitly enables tracing (odd iterations).
  $IPROF --debug 0 $THAPI_TEST_BIN $TEST_ARGS
  # NOTE: gtensor kernel names are complex, but all start with gt::, look it
  # up rather than hard coding and making it fragile. Includes quotes.
  # Read the replay summary once so the name lookup and the call count come
  # from the same snapshot (the original invoked `$IPROF -r -j` twice).
  summary=$($IPROF -r -j)
  kernel_name=$(echo "$summary" | jq '.device.data | keys[]' | grep 'gt::')
  write_count=$(echo "$summary" | jq ".device.data | .[${kernel_name}].call")
  # 0th call, plus 5 odd calls
  [ "$write_count" -eq 6 ]
}

@test "axpy-${GTENSOR_DEVICE}-no-trace-from-start" {
  # With --no-trace-from-start, profiling is disabled until the first
  # thapi_ctl_start, so only the 5 odd iterations are captured.
  $IPROF --no-trace-from-start --debug 0 $THAPI_TEST_BIN $TEST_ARGS
  # Read the replay summary once so the name lookup and the call count come
  # from the same snapshot (the original invoked `$IPROF -r -j` twice).
  summary=$($IPROF -r -j)
  kernel_name=$(echo "$summary" | jq '.device.data | keys[]' | grep 'gt::')
  write_count=$(echo "$summary" | jq ".device.data | .[${kernel_name}].call")
  # 5 odd calls, does not include 0 since profiling should be disabled at start
  [ "$write_count" -eq 5 ]
}
38 changes: 38 additions & 0 deletions integration_tests/gtensor/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
cmake_minimum_required(VERSION 3.23 FATAL_ERROR)

project(thapi-integration-test)

# Device backend selected by the caller (cuda/hip/sycl); only cuda needs
# extra language/toolkit setup here.
if(GTENSOR_DEVICE STREQUAL "cuda")
  enable_language(CUDA)
  find_package(CUDAToolkit REQUIRED)
endif()

option(USE_THAPI_CTL "use thapi-ctl to start/stop tracing" ON)
set(THAPI_PATH "/opt/thapi" CACHE STRING "Path to thapi installation")

message(STATUS "${PROJECT_NAME}: THAPI_PATH=${THAPI_PATH}")

# add dependencies via the vendored CPM bootstrap (cmake/CPM.cmake)
include(cmake/CPM.cmake)
# NOTE(review): GIT_TAG "main" is not reproducible — consider pinning a
# release tag or commit hash so CI does not break on upstream changes.
CPMFindPackage(NAME gtensor
  GITHUB_REPOSITORY wdmapp/gtensor
  GIT_TAG "main"
  OPTIONS "GTENSOR_ENABLE_BLAS ON"
          "GTENSOR_ENABLE_SOLVER ON")

add_executable(axpy_start_stop)
target_gtensor_sources(axpy_start_stop PRIVATE axpy_start_stop.cxx)
target_link_libraries(axpy_start_stop gtensor::gtensor)

# Test the variable by name: `if(${USE_THAPI_CTL})` dereferences first and
# would break if the variable were ever undefined; if(USE_THAPI_CTL) is the
# idiomatic form for option() variables.
if(USE_THAPI_CTL)
  find_package(PkgConfig REQUIRED)
  pkg_check_modules(LTTNG_CTL REQUIRED lttng-ctl)

  # Imported interface target wrapping the installed libthapi-ctl.
  add_library(thapi_ctl INTERFACE IMPORTED)
  target_include_directories(thapi_ctl INTERFACE "${THAPI_PATH}/include")
  target_link_libraries(thapi_ctl INTERFACE "${THAPI_PATH}/lib/libthapi-ctl.so")
  target_link_libraries(thapi_ctl INTERFACE ${LTTNG_CTL_LDFLAGS})
  target_link_libraries(axpy_start_stop thapi_ctl)
else()
  # Without thapi-ctl, compile out the start/stop calls in the test source.
  target_compile_definitions(axpy_start_stop PRIVATE NO_THAPI_CTL)
endif()
13 changes: 13 additions & 0 deletions integration_tests/gtensor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# gtensor integration tests

Allows testing the cuda, hip, and ze backends with a single source file.

# Usage

```
GTENSOR_DEVICE=cuda bats ./integration_tests/gtensor.bats
```
Or set `GTENSOR_DEVICE` to "hip" or "sycl" (the "sycl" build exercises the THAPI ze backend).

Note that THAPI hip backend does not yet do device profiling, modifications will need to be
made for tests to pass on hip.
105 changes: 105 additions & 0 deletions integration_tests/gtensor/axpy_start_stop.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@

#include <iostream>

#include <gtensor/gtensor.h>

#ifndef NO_THAPI_CTL
#include "thapi-ctl.h"
#endif

// provides convenient shortcuts for common gtensor functions, for example
// underscore ('_') to represent open slice ends.
using namespace gt::placeholders;

/**
 * @brief Computes a*x + y ("daxpy") for host or device gtensors.
 *
 * Templated so the same code works with gt::space::host or gt::space::device
 * arrays. Returning by value is cheap: RVO / guaranteed copy elision (C++17)
 * moves the result into the LHS it is assigned to rather than copying the
 * data (important for large arrays). Inputs are taken by const reference to
 * avoid copies, since gtensor (like std::vector) has copy semantics by
 * default.
 *
 * NOTE(review): not called by main() in this file; kept as a reference
 * implementation of the expression being profiled.
 *
 * @param a scalar multiplier
 * @param x first input array (scaled by a)
 * @param y second input array (added)
 * @return a newly evaluated gtensor holding a*x + y
 */
template <typename Ex, typename Ey>
auto daxpy(double a, const Ex& x, const Ey& y)
{
  // 'a * x + y' builds an un-evaluated gfunction expression template;
  // gt::eval forces evaluation here so the return statement produces a
  // concrete gtensor. Omitting gt::eval would instead return the lazy
  // expression, which could be composed with further operations before
  // being evaluated or assigned to a gtensor.
  return gt::eval(a * x + y);
}

int main(int argc, char** argv)
{
  // Problem size and how many elements to show in the final spot check.
  int n = 1024 * 1024;
  int nprint = 32;

  double a = 0.5;

  // Define and allocate two 1d vectors of size n on the host. Declare
  // but don't allocate a third 1d host vector for storing the result.
  gt::gtensor<double, 1, gt::space::host> h_x(gt::shape(n));
  gt::gtensor<double, 1, gt::space::host> h_y = gt::empty_like(h_x);
  gt::gtensor<double, 1, gt::space::host> h_axpy;

  // Initialize the vectors: x is twice its index value and y equals its
  // index value. We will perform .5*x + y, so the result should be
  // axpy(i) = 2i.
  for (int i = 0; i < n; i++) {
    h_x(i) = 2.0 * static_cast<double>(i);
    h_y(i) = static_cast<double>(i);
  }

  // This test only exercises start/stop tracing of device kernels, so a
  // device build of gtensor is mandatory.
#ifndef GTENSOR_HAVE_DEVICE
#error "device required"
#endif

  // Define and allocate device versions of h_x and h_y, and declare
  // a variable for the result on gpu.
  gt::gtensor<double, 1, gt::space::device> d_x(gt::shape(n));
  gt::gtensor<double, 1, gt::space::device> d_y = gt::empty_like(d_x);
  gt::gtensor<double, 1, gt::space::device> d_axpy;

  // Explicit copies of input from host to device. Note that this is an
  // overload of the copy function for gtensor and gtensor_span types, not
  // std::copy which has a different signature. The source is the first
  // argument and destination the second argument. Currently thrust::copy is
  // used under the hood in the implementation.
  gt::copy(h_x, d_x);
  gt::copy(h_y, d_y);

  // This automatically generates a computation kernel to run on the
  // device. The expression is lazy; each gt::eval below launches it.
  auto expr = a * d_x + d_y;

  // Run the kernel 10 times, enabling tracing only on odd iterations via
  // thapi_ctl_start/stop. The companion gtensor.bats test expects 6 traced
  // launches by default (iteration 0 is captured because tracing is on
  // from program start, plus iterations 1,3,5,7,9) and 5 launches with
  // --no-trace-from-start.
  for (int i = 0; i < 10; i++) {
#ifndef NO_THAPI_CTL
    if (i % 2 == 1) {
      thapi_ctl_start();
    }
#endif

    d_axpy = gt::eval(expr);
    h_axpy = gt::empty_like(h_x);
    gt::copy(d_axpy, h_axpy);
    gt::synchronize();

    std::cout << "axpy(10) = " << h_axpy(10) << std::endl;

#ifndef NO_THAPI_CTL
    if (i % 2 == 1) {
      thapi_ctl_stop();
    }
#endif
  }

  // Define a slice to print a subset of elements for spot checking the
  // result.
  auto print_slice = gt::gslice(_, _, n / nprint);
  std::cout << "a = " << a << std::endl;
  std::cout << "x = " << h_x.view(print_slice) << std::endl;
  std::cout << "y = " << h_y.view(print_slice) << std::endl;
  std::cout << "a*x + y = " << h_axpy.view(print_slice) << std::endl;
}
24 changes: 24 additions & 0 deletions integration_tests/gtensor/cmake/CPM.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# SPDX-License-Identifier: MIT
#
# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
#
# Vendored CPM.cmake bootstrap: downloads the pinned CPM release on first
# configure and caches it locally. Do not hand-edit beyond bumping the
# version and its matching SHA256 below together.

set(CPM_DOWNLOAD_VERSION 0.39.0)
set(CPM_HASH_SUM "66639bcac9dd2907b2918de466783554c1334446b9874e90d38e3778d404c2ef")

# Prefer an explicit CPM_SOURCE_CACHE (CMake variable, then environment)
# so repeated configures across projects share one download.
if(CPM_SOURCE_CACHE)
set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)

# EXPECTED_HASH makes the download idempotent and tamper-evident; file()
# is a no-op when the cached copy already matches.
file(DOWNLOAD
https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
)

include(${CPM_DOWNLOAD_LOCATION})
Loading