From 0b50b1ad43a4f6bc82be298e4c7be9ee569a5242 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Thu, 19 Sep 2024 16:17:26 -0500 Subject: [PATCH 1/4] Remove MPI_Wtime* from default trace mode (#290) Co-authored-by: Thomas Applencourt --- xprof/xprof.rb.in | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index e2aed6d5..9b115c23 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -517,7 +517,12 @@ end def enable_events_mpi(channel_name, tracing_mode: 'default', profiling: true) lttng_enable = "lttng enable-event --userspace --session=#{lttng_session_uuid} --channel=#{channel_name}" - exec("#{lttng_enable} lttng_ust_mpi:*") + case tracing_mode + when 'full' + exec("#{lttng_enable} lttng_ust_mpi:*") + when 'default' + exec("#{lttng_enable} lttng_ust_mpi:* -x lttng_ust_mpi:MPI_WTime*") + end exec("#{lttng_enable} lttng_ust_mpi_type:*") end From dcd4aede0eef66adb2642d751da2587df6f97928 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 4 Sep 2024 13:22:14 -0500 Subject: [PATCH 2/4] Dependency update (#286) Use dependency from Efficios deliverable. 
--------- Co-authored-by: Thomas Applencourt --- .github/workflows/presubmit.yml | 203 ++++++++++++++++++-------------- ze/Makefile.am | 6 +- 2 files changed, 122 insertions(+), 87 deletions(-) diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 0901bb69..6ec61195 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -1,6 +1,9 @@ name: Presubmit on: [push, pull_request] +env: + APT_PACKAGE: gcc g++ ruby ruby-dev elfutils libelf-dev libpopt-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev libnuma-dev liburcu-dev + jobs: pre_job: runs-on: ubuntu-24.04 @@ -15,65 +18,93 @@ jobs: paths_ignore: '["**/README.md"]' do_not_skip: '["pull_request"]' - babeltrace2: + efficios_dep: needs: pre_job if: ${{ needs.pre_job.outputs.should_skip != 'true' }} - name: Build and cache Babeltrace2 + name: Build and Cache Efficios Dependencies runs-on: ubuntu-24.04 steps: - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler libglib2.0-dev - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: wget https://www.efficios.com/files/babeltrace/babeltrace2-2.0.5.tar.bz2 - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: tar -xjvf babeltrace2-2.0.5.tar.bz2 - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: | - wget https://github.com/argonne-lcf/THAPI-spack/raw/main/packages/babeltrace2/d2d2e6cc.patch - patch -p1 < d2d2e6cc.patch - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5 - - run: mkdir -p babeltrace2-2.0.5/build - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: ../configure 
--prefix=$HOME/babeltrace2/2.0.5 - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build - - run: make -j - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build - - run: make -j install - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build + - name: Set PKG_CONFIG + run: | + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + if: steps.efficios_dep.outputs.cache-hit != 'true' + - run: sudo apt update; sudo apt install -y $APT_PACKAGE + if: steps.efficios_dep.outputs.cache-hit != 'true' + # lttng-ust + - run: git clone https://github.com/lttng/lttng-ust + if: steps.efficios_dep.outputs.cache-hit != 'true' + - name: Install lttng-ust + run: | + # Avoid https://github.com/lttng/lttng-ust/commit/b187bcd5d99cde54dececee0e5028524d55aa314 who change the signature of + # lttng_ust_ctl_recv_register_event used by lttng-tool anl-ms3 + git checkout 4f8afc535e77070f1ef00434674f0417c6f9ef69 + ./bootstrap + ./configure --disable-man-pages --prefix=$HOME/efficios_dep/ + make -j$(nproc) + make install + working-directory: lttng-ust + if: steps.efficios_dep.outputs.cache-hit != 'true' + # lttng-tools need lttng-ust 2.14+ + - run: git clone -b anl-ms3 git://git.efficios.com/deliverable/lttng-tools.git + if: steps.efficios_dep.outputs.cache-hit != 'true' + - name: Install lttng-tools + run: | + ./bootstrap + ./configure --disable-man-pages --disable-bin-lttng-crash --prefix=$HOME/efficios_dep + make -j$(nproc) + make install + #Todo, this will need to be added in the spack repo as a patch + echo "#!/usr/bin/env python"| cat - dirwatch.py > $HOME/efficios_dep/bin/dirwatch.py + chmod 755 $HOME/efficios_dep/bin/dirwatch.py + working-directory: lttng-tools + if: steps.efficios_dep.outputs.cache-hit != 'true' + # babeltrace + - run: git clone -b anl-ms3 git://git.efficios.com/deliverable/babeltrace.git + if: 
steps.efficios_dep.outputs.cache-hit != 'true' + - name: Install Babeltrace + run: | + #Todo, grab file from Spack + wget https://raw.githubusercontent.com/argonne-lcf/THAPI/53262fcaaaf45d7d475884d7e63b69abe47e41d6/.github/workflows/str_nullptr.patch + patch -p1 < str_nullptr.patch + wget https://raw.githubusercontent.com/argonne-lcf/THAPI/4418916620496fd66cde0b3d5e241bed0a4c18a3/.github/workflows/bt_makefile.patch + patch -p1 < bt_makefile.patch + ./bootstrap + ./configure --disable-man-pages --prefix=$HOME/efficios_dep + make -j$(nproc) + make install + working-directory: babeltrace + if: steps.efficios_dep.outputs.cache-hit != 'true' build-and-check: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Build and Check ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo 
"CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: ./autogen.sh - run: ../configure @@ -94,7 +125,7 @@ jobs: build/**/tests/*.log install-with-mpi: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Install with MPI daemon support runs-on: ubuntu-24.04 @@ -102,23 +133,23 @@ jobs: - uses: actions/checkout@v4 - uses: mpi4py/setup-mpi@v1 with: - mpi: intelmpi + mpi: intelmpi - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: 
./autogen.sh - run: ../configure --prefix=`pwd`/ici @@ -133,7 +164,7 @@ jobs: path: thapi.tar integration-tests: - needs: [babeltrace2, pre_job, install-with-mpi] + needs: [efficios_dep, pre_job, install-with-mpi] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Integration ${{ matrix.bats_file }} ${{matrix.thapi_sync_daemon }} runs-on: ubuntu-24.04 @@ -146,50 +177,50 @@ jobs: with: mpi: intelmpi - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - uses: actions/download-artifact@v4 with: name: thapi-bin - name: Untar THAPI run: tar -xvf thapi.tar - - run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo - run: sudo gem install babeltrace2 opencl_ruby_ffi - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - name: Integration test run: | bats integration_tests/ - build-in-tree: - needs: [babeltrace2, pre_job] + build-in-tree-and-check: + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Build in Tree ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev 
elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: ./autogen.sh - run: ./configure - run: make -j @@ -206,28 +237,28 @@ jobs: ./**/tests/*.log distcheck: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Distcheck ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo 
"PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: ./autogen.sh - run: ../configure @@ -238,28 +269,28 @@ jobs: THAPI_VALGRIND: 1 dist-check: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Dist and Check ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo 
"$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: ./autogen.sh - run: ../configure diff --git a/ze/Makefile.am b/ze/Makefile.am index 20d14200..3e06da70 100644 --- a/ze/Makefile.am +++ b/ze/Makefile.am @@ -1,7 +1,11 @@ .DELETE_ON_ERROR: if STRICT - WERROR = -Werror + # We disable `nonnull` check due to + # ././ze_tracepoints.h: In function 'lttng_ust__event_get_size__lttng_ust_ze___zeModuleCreate_entry': + # [...]/lttng/ust-tracepoint-event.h:578:17: error: argument 1 null where non-null expected [-Werror=nonnull] + # 578 | strlen((_src) ? (_src) : LTTNG_UST__NULL_STRING) + 1; + WERROR = -Werror -Wno-error=nonnull else WERROR = endif From 481dd72a30701b5a16f853f33f4ad7a407544bec Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 4 Sep 2024 15:36:13 -0500 Subject: [PATCH 3/4] Add archive (#287) Enable usage of session rotation for lossless online trace consumption. 
--------- Co-authored-by: Thomas Applencourt --- .github/workflows/presubmit.yml | 8 ++-- integration_tests/general.bats | 4 ++ utils/babeltrace_thapi.in | 9 ++++ xprof/xprof.rb.in | 79 ++++++++++++++++++++++++++++----- 4 files changed, 86 insertions(+), 14 deletions(-) diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 6ec61195..58eaa44b 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -190,7 +190,7 @@ jobs: run: tar -xvf thapi.tar - run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo - run: sudo gem install babeltrace2 opencl_ruby_ffi - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV @@ -214,7 +214,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV @@ -252,7 +252,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV @@ -284,7 +284,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencie run: | echo 
"$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV diff --git a/integration_tests/general.bats b/integration_tests/general.bats index fbeac370..2279905e 100644 --- a/integration_tests/general.bats +++ b/integration_tests/general.bats @@ -22,6 +22,10 @@ teardown_file() { rm out.pftrace } +@test "archive_summary" { + $IPROF --archive $THAPI_TEST_BIN +} + @test "replay_summary" { $IPROF $THAPI_TEST_BIN $IPROF -r diff --git a/utils/babeltrace_thapi.in b/utils/babeltrace_thapi.in index fe344861..08125a0a 100755 --- a/utils/babeltrace_thapi.in +++ b/utils/babeltrace_thapi.in @@ -130,6 +130,7 @@ def get_components(names) components_classes = { 'source.ctf.fs' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('fs'), 'source.ctf.lttng_live' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-live'), + 'source.ctf.lttng_archive' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-archive'), 'filter.utils.muxer' => BT2::BTPlugin.find('utils').get_filter_component_class_by_name('muxer'), 'sink.text.pretty' => BT2::BTPlugin.find('text').get_sink_component_class_by_name('pretty'), 'sink.ctf.fs' => BT2::BTPlugin.find('ctf').get_sink_component_class_by_name('fs'), @@ -200,6 +201,10 @@ def get_and_add_components(graph, names, l_inputs) graph.add(comp, 'source_live', params: { 'inputs' => $options[:inputs], 'session-not-found-action' => 'end' }) + when 'source.ctf.lttng_archive' + graph.add(comp, 'source_archive', + params: { 'session-name' => $options[:archive], + 'session-found-file-path' => $options[:'archive-session-found-file-path'] }) when 'source.ctf.fs' s = Find.find(*l_inputs) .reject { |path| FileTest.directory?(path) } @@ -281,6 +286,8 @@ def bt_graphs(inputs) @bt_graphs[inputs] ||= begin g_comps = [if $options[:live] 'source.ctf.lttng_live' + elsif $options[:archive] + 'source.ctf.lttng_archive' else 'source.ctf.fs' end] @@ -354,6 +361,8 @@ 
class BabeltraceParserThapi < OptionParserWithDefaultAndValidation 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) on('--debug', default: false) + on('--archive SESSION-NAME') + on('--archive-session-found-file-path PATH') on('--[no-]muxer') on('-v', '--version', 'Print the version string') do puts File.read(File.join(DATADIR, 'version')) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 9b115c23..b098e99b 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -13,6 +13,11 @@ PREFIX = '@prefix@' DATAROOTDIR = File.join(PREFIX, 'share') DATADIR = DATAROOTDIR +LTTNG_ARCHIVE_SIZE = '50M' +LTTNG_ARCHIVE_TIMER = '60s' +LTTNG_DIRWATCH_SIZE = '500' # In MiB +LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1 + $LOAD_PATH.unshift(DATADIR) if File.directory?(DATADIR) require 'open3' require 'fileutils' @@ -311,6 +316,7 @@ class Sync_daemon raise ensure return unless f + f.global_barrier f.finalize end @@ -565,16 +571,24 @@ def lm_setup_lttng(backends) end end + # This is required to force the creation of a trace, + # so that dirwatch doesn't complain about an empty trace + if OPTIONS[:archive] + exec("lttng enable-rotation --session=#{lttng_session_uuid} --size=#{LTTNG_ARCHIVE_SIZE} --timer=#{LTTNG_ARCHIVE_TIMER}") + end exec("lttng start #{lttng_session_uuid}") end def lm_lttng_teardown_session raise unless mpi_local_master? + + exec("lttng rotate #{lttng_session_uuid}") if OPTIONS[:archive] exec("lttng destroy #{lttng_session_uuid}") end def lm_lttng_kill_sessiond raise unless mpi_local_master? + # Need to kill the sessiond Daemon. It's safe because each job has their own # # In theory, opening the lttng-sessiond.pid file is racy. 
@@ -601,7 +615,27 @@ def lm_babeltrace(backends) opts << "--output #{thapi_trace_dir_tmp}" opts << "--backends #{backends.join(',')}" opts << '--no-discard-metadata' if type == 'aggreg' && OPTIONS.include?(:'kernel-verbose') - exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + + if OPTIONS[:archive] + read_file = File.join(lttng_trace_dir_tmp, 'bt_archive_ready') + opts << "--archive #{lttng_session_uuid} --archive-session-found-file-path=#{read_file}" + cmd = "#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}" + LOGGER.debug(cmd) + pid_bt = spawn(cmd) + + cmd = "dirwatch.py --log-level=CRITICAL #{lttng_session_uuid} #{LTTNG_DIRWATCH_SIZE}" + LOGGER.debug(cmd) + pid_dirwatch = spawn(cmd) + + until File.exist?(read_file) + # Ensure that dirwatch.py didn't crash, and deadlock + Process.wait(pid_dirwatch, Process::WNOHANG) + sleep(LTTNG_DIRWATCH_LOCK_RETRY_DELAY) + end + [pid_bt, pid_dirwatch] + else + exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + end end # _ @@ -609,7 +643,7 @@ end # | | (_) (_ (/_ _> _> | | | (_| # _| -# Some naming convension +# Some naming convention # lm == function executed only local_master # gm == function executed only global_master @@ -619,6 +653,11 @@ def lm_move_to_shared if OPTIONS.include?(:trace) || !OPTIONS[:analysis] # The Apps finished, lttng finished, need to move to the shared tmp folder FileUtils.mkdir_p(File.dirname(thapi_trace_dir_tmp)) + # NOTE: I don't understand `mv` + # File.mv(a, b) will put a into b (aka a/b) + # FileUtils.rename(a,b) will move a as b, but may + # raise Invalid cross-device error. + # So we use `exec(mv -T a b)`, this has the added benefit of logging exec("mv #{lttng_trace_dir_tmp} #{thapi_trace_dir_tmp}") else # `lm_babeltrace` finished, can remove `tmp` folder @@ -634,7 +673,7 @@ def gm_rename_folder # Replace it with a better name, and update the root metadata. 
thapi_trace_dir_tmp_root = File.dirname(thapi_trace_dir_tmp) - # Because of `traced-rank`, `mpi_master` may not have any trace avalaible, + # Because of `traced-rank`, `mpi_master` may not have any trace available, # so find the first hostname who have a metadata FileUtils.cp(Dir.glob("#{thapi_trace_dir_tmp_root}/*/thapi_metadata.yaml").first, File.join(thapi_trace_dir_tmp_root, 'thapi_metadata.yaml')) @@ -658,26 +697,45 @@ def trace_and_on_node_processing(usr_argv) # All ranks need to set the LLTTNG_HOME env # so they can have access to the daemon ENV['LTTNG_HOME'] = lttng_home_dir - # Only local master spawn LTTNG daemon and start session - lm_setup_lttng(backends) if mpi_local_master? + LOGGER.debug("LTTNG_HOME = #{ENV.fetch('LTTNG_HOME', nil)}") + + # Only local master spawn daemons (lttng, and babeltrace) + # and the start the lttng-session + pids = if mpi_local_master? + lm_setup_lttng(backends) + lm_babeltrace(backends) if OPTIONS[:archive] + end + # Other local node cannot start before lttng and the daemon syncd.local_barrier('waiting_for_lttng_setup') # Launch User Command launch_usr_bin(h, usr_argv) - - # We need to be sure that all the local ranks are finished + # We need to ensure that all the local ranks have finished + # running the user application # before the local master stops the lttng session syncd.local_barrier('waiting_for_application_ending') + + # Everything from now on, is some local-master processing + # The `Sync_daemon` context will handle the call to the global barrier + # for the early exiting ranks return unless mpi_local_master? - # Stop Lttng session + # Stop Lttng session and babeltrace daemons lm_lttng_teardown_session - # Lttng session is finished, + if OPTIONS[:archive] + LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish") + pids.each do |pid| + Process.wait(pid) + raise "#{pid} failed" unless $?.success? 
+ end + end # we can kill the session daemon lm_lttng_kill_sessiond # Preprocess trace - lm_babeltrace(backends) + lm_babeltrace(backends) unless OPTIONS[:archive] lm_move_to_shared end + # Global master rename the unique trace folder to a more + # human friendly name gm_rename_folder if mpi_master? end @@ -772,6 +830,7 @@ if __FILE__ == $PROGRAM_NAME parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.", 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) + parser.on('--[no-]archive', 'Trigger for ardhive support', default: false) # Analysis parser.on('-r', '--replay [PATH]', 'Replay traces for post-mortem analysis.', From 41248b293b52bb4bb8d81640a479e9e25f639c1d Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 17 Sep 2024 14:55:57 -0500 Subject: [PATCH 4/4] Single rank profiling (#288) * Make only local master do energy profiling. * Use ZES to query devices in order to get around affinity masks. * Use ZES for drivers as well. * set ZES * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update xprof/xprof.rb.in --------- Co-authored-by: Brice Videau Co-authored-by: Thomas Applencourt --- xprof/xprof.rb.in | 10 +++++--- ze/tracer_ze_helpers.include.c | 46 ++++++++++++++++------------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index b098e99b..0e91b714 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -332,8 +332,7 @@ end def sampling? return false unless OPTIONS[:sample] - - env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '0') == '0' || mpi_local_master? + env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '1') == '0' || mpi_local_master? 
end def env_tracers @@ -386,11 +385,16 @@ def env_tracers end # Sample + # Currently the same `so` does the tracing, and the sampling + # This means that if the local rank is not part of the `traced-ranks`, + # no sampling will be performed if sampling? LOGGER.debug('Sampling Enabled') h['LTTNG_UST_SAMPLING'] = 1 h['LTTNG_UST_SAMPLING_ENERGY'] = 1 - h['ZES_ENABLE_SYSMAN'] = 1 if OPTIONS[:'backend-names'].include?('ze') + # Currently, the only reliable way to use the zes api + # is to call zesInit and set ZES_ENABLE_SYSMAN to 0 + h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze') end backends = [] unless need_backend diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 292e993f..164149a9 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -793,8 +793,8 @@ static int _sampling_freq_initialized = 0; static int _sampling_pwr_initialized = 0; static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution -static ze_driver_handle_t* _sampling_hDrivers = NULL; -static ze_device_handle_t** _sampling_hDevices = NULL; +static zes_driver_handle_t* _sampling_hDrivers = NULL; +static zes_device_handle_t** _sampling_hDevices = NULL; static zes_freq_handle_t*** _sampling_hFrequencies = NULL; static zes_pwr_handle_t*** _sampling_hPowers = NULL; static zes_engine_handle_t*** _sampling_engineHandles = NULL; @@ -909,58 +909,54 @@ static void intializeEngines() { static int initializeHandles() { ze_result_t res; - const char *e = getenv("ZES_ENABLE_SYSMAN"); - if (!(e && e[0] == '1')) { - fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n"); - return -1; - } -#ifdef CALL_ZEINIT - res = zeInit(ZE_INIT_FLAG_GPU_ONLY); + res = ZES_INIT_PTR(0); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("zeInit", res); + _ZE_ERROR_MSG("ZES_INIT_PTR", res); return -1; } -#endif // Query driver _sampling_driverCount = 0; - res = 
ZES_DRIVER_GET_PTR(&_sampling_driverCount, NULL); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("1st ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DRIVER_GET_PTR", res); return -1; } - _sampling_hDrivers = (ze_driver_handle_t*) calloc(_sampling_driverCount, sizeof(ze_driver_handle_t)); - res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); + _sampling_hDrivers = (zes_driver_handle_t*) calloc(_sampling_driverCount, sizeof(zes_driver_handle_t)); + res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DRIVER_GET_PTR", res); return -1; } _sampling_deviceCount = (uint32_t*) calloc(_sampling_driverCount, sizeof(uint32_t)); _sampling_subDeviceCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); - _sampling_hDevices = (ze_device_handle_t**) calloc(_sampling_driverCount, sizeof(ze_device_handle_t*)); + _sampling_hDevices = (zes_device_handle_t**) calloc(_sampling_driverCount, sizeof(zes_device_handle_t*)); // Query device count for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) { fprintf(stderr, "ERROR: No device found!\n"); - _ZE_ERROR_MSG("1st ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res); return -1; } - _sampling_hDevices[driverIdx] = (ze_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t)); - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); + _sampling_hDevices[driverIdx] = (zes_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_device_handle_t)); + res = 
ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res); free(_sampling_hDevices[driverIdx]); return -1; } //Get no sub-devices _sampling_subDeviceCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { - res = ZE_DEVICE_GET_SUB_DEVICES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL); + zes_device_properties_t deviceProperties = {0}; + deviceProperties.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; + res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProperties); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res); + _ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res); _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; - } + } else + _sampling_subDeviceCount[driverIdx][deviceIdx] = deviceProperties.numSubdevices; if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) { _sampling_subDeviceCount[driverIdx][deviceIdx] = 1; }