From 1d23af6fec476acf1f1c07c6fb9a65527805d384 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 1 Oct 2024 10:33:34 -0500 Subject: [PATCH 1/8] Better error message when bin doesn't exist (#291) * better error message when bin doesn't exist * fix daemon not run --------- Co-authored-by: Thomas Applencourt --- xprof/xprof.rb.in | 51 +++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 9b115c23..0a2ed09a 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -307,12 +307,14 @@ class Sync_daemon # we always call clean-up the daemon def self.open yield f = new - rescue StandardError - raise + rescue Errno::ENOENT + exit(1) ensure - return unless f - f.global_barrier - f.finalize + # https://www.rubydoc.info/gems/rubocop/RuboCop/Cop/Lint/EnsureReturn + if f + f.global_barrier + f.finalize + end end end @@ -346,8 +348,7 @@ def env_tracers %w[ze ze libze_loader libTracerZE], %w[cuda cuda libcuda libTracerCUDA], %w[hip hip libamdhip64 libTracerHIP], - %w[mpi mpi libmpi libTracerMPI], - ].each do |name, bt_name, lib, libtracer| + %w[mpi mpi libmpi libTracerMPI]].each do |name, bt_name, lib, libtracer| # Backend requested, skip omp. It will be handled in a custom case bellow next unless OPTIONS[:'backend-names'].include?(bt_name) @@ -430,6 +431,9 @@ def launch_usr_bin(env, cmd) LOGGER.warn { 'Application Exited' } rescue Interrupt LOGGER.warn { 'Application Received Interrupt Signal' } + rescue Errno::ENOENT + warn("#{__FILE__}: Can't find executable #{cmd.first}") + raise Errno::ENOENT end end @@ -570,11 +574,13 @@ end def lm_lttng_teardown_session raise unless mpi_local_master? + exec("lttng destroy #{lttng_session_uuid}") end def lm_lttng_kill_sessiond raise unless mpi_local_master? + # Need to kill the sessiond Daemon. It's safe because each job has their own # # In theory, opening the lttng-sessiond.pid file is racy. @@ -650,7 +656,19 @@ end # Start, Stop lttng, amd do the on-node analsysis def trace_and_on_node_processing(usr_argv) - # Global barrier at exit + def teardown_lttng(syncd) + # We need to be sure that all the local ranks are finished + # before the local master stops the lttng session + syncd.local_barrier('waiting_for_application_ending') + return unless mpi_local_master? + + # Stop Lttng session + lm_lttng_teardown_session + # Lttng session is finished, + # we can kill the session daemon + lm_lttng_kill_sessiond + end + Sync_daemon.open do |syncd| # Load Tracers and APILoaders Lib backends, h = env_tracers @@ -661,19 +679,18 @@ def trace_and_on_node_processing(usr_argv) # Only local master spawn LTTNG daemon and start session lm_setup_lttng(backends) if mpi_local_master? syncd.local_barrier('waiting_for_lttng_setup') + # Launch User Command - launch_usr_bin(h, usr_argv) + begin + launch_usr_bin(h, usr_argv) + rescue Errno::ENOENT + teardown_lttng(syncd) + raise + end - # We need to be sure that all the local ranks are finished - # before the local master stops the lttng session - syncd.local_barrier('waiting_for_application_ending') + teardown_lttng(syncd) return unless mpi_local_master? - # Stop Lttng session - lm_lttng_teardown_session - # Lttng session is finished, - # we can kill the session daemon - lm_lttng_kill_sessiond # Preprocess trace lm_babeltrace(backends) lm_move_to_shared From 123017952af8c83c5069dde96c20a88d522886aa Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 1 Oct 2024 14:16:09 -0500 Subject: [PATCH 2/8] Improve Naming of SyncDaemon (#293) * improve naming * fix sync_daemon_fs --------- Co-authored-by: Thomas Applencourt --- xprof/sync_daemon_fs | 14 +++++++------- xprof/xprof.rb.in | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index dea845b7..c04c0d10 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -61,19 +61,19 @@ global_handle = nil parent_pid = nil # Set trap -Signal.trap(Sync_daemon::RT_SIGNAL_GLOBAL_BARRIER) do +Signal.trap(SyncDaemon::RT_SIGNAL_GLOBAL_BARRIER) do global_barrier(global_handle) - Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) + Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) end local_barier_count = 0 -Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do +Signal.trap(SyncDaemon::RT_SIGNAL_LOCAL_BARRIER) do local_barier(local_barier_count.to_s) local_barier_count += 1 - Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) + Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) end -Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do +Signal.trap(SyncDaemon::RT_SIGNAL_FINISH) do # We cannot delete SHARED_LOCAL_FILESYSTEM # Some rank can exit the `global_barier` (hence calling this function) # when others ranks are still in the `local_barrier` @@ -83,12 +83,12 @@ Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do # is to make all ranks busy_wait in the `global_barrier`. # This will ensure that every-one exited the `local_barrier`. # but given the poor performance of our FS, we will avoid that for now... - Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) + Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) exit end # Init global barrier global_handle = init_global_barrier parent_pid = ARGV[0].to_i -Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) +Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) sleep diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 0a2ed09a..66396d42 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -233,7 +233,7 @@ end # |_) _. ._ ._ o _ ._ # |_) (_| | | | (/_ | # -class Sync_daemon +class SyncDaemon SIGRTMIN = 34 RT_SIGNAL_READY = SIGRTMIN RT_SIGNAL_GLOBAL_BARRIER = SIGRTMIN + 1 @@ -280,13 +280,13 @@ class Sync_daemon end LOGGER.debug { "spawn(#{daemon} #{Process.pid})" } - lazy_exec("Initialize Sync_daemon #{daemon_type}") do + lazy_exec("Initialize SyncDaemon #{daemon_type}") do @pid = spawn("#{daemon} #{Process.pid}") end end def finalize - lazy_exec('Finalize Sync_daemon') do + lazy_exec('Finalize SyncDaemon') do `kill -#{RT_SIGNAL_FINISH} #{@pid}` end end @@ -306,14 +306,14 @@ class Sync_daemon # Context manager, ensure that when the block yield is exited # we always call clean-up the daemon def self.open - yield f = new + yield syncd = new rescue Errno::ENOENT exit(1) ensure # https://www.rubydoc.info/gems/rubocop/RuboCop/Cop/Lint/EnsureReturn - if f - f.global_barrier - f.finalize + if syncd + syncd.global_barrier + syncd.finalize end end end @@ -669,7 +669,7 @@ def trace_and_on_node_processing(usr_argv) lm_lttng_kill_sessiond end - Sync_daemon.open do |syncd| + SyncDaemon.open do |syncd| # Load Tracers and APILoaders Lib backends, h = env_tracers From 8df83b0f41a19bc189c0865e4d67b3c931d4476a Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 1 Oct 2024 16:56:24 -0500 Subject: [PATCH 3/8] Forward exit code (#294) * Save exit code * used it * add test * fix no analysis * correct handling * Update xprof/xprof.rb.in --------- Co-authored-by: Thomas Applencourt Co-authored-by: Thomas Applencourt Co-authored-by: Brice Videau --- integration_tests/general.bats | 8 ++++++++ xprof/xprof.rb.in | 31 +++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/integration_tests/general.bats b/integration_tests/general.bats index fbeac370..b4784770 100644 --- a/integration_tests/general.bats +++ b/integration_tests/general.bats @@ -67,3 +67,11 @@ teardown_file() { [ "$status" != 0 ] rm out.pftrace } + +@test "exit_code_propagated" { + run $IPROF -- bash -c "exit 55" + [ "$status" == 55 ] + + run $IPROF --no-analysis -- bash -c "exit 55" + [ "$status" == 55 ] +} diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 66396d42..e1a8e3a3 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -13,6 +13,18 @@ PREFIX = '@prefix@' DATAROOTDIR = File.join(PREFIX, 'share') DATADIR = DATAROOTDIR +class XprofExitCode + @@exit_code = 0 + def self.update(exit_code) + # Keep only the first error + @@exit_code = exit_code if @@exit_code == 0 + end + + def self.get + @@exit_code + end +end + $LOAD_PATH.unshift(DATADIR) if File.directory?(DATADIR) require 'open3' require 'fileutils' @@ -423,14 +435,17 @@ def launch_usr_bin(env, cmd) begin PTY.spawn(bash_env, *cmd) do |stdout, _stdin, _pid| + # Reading stdout will trigger Errno::EIO stdout.each { |line| print line } rescue Errno::EIO + # Wait for the PTY to finish, to set $? + Process.wait(_pid) + return $?.exitstatus end - # Not sure how this exception can be triggered - rescue PTY::ChildExited - LOGGER.warn { 'Application Exited' } rescue Interrupt LOGGER.warn { 'Application Received Interrupt Signal' } + # SigINT is 2 + 2 rescue Errno::ENOENT warn("#{__FILE__}: Can't find executable #{cmd.first}") raise Errno::ENOENT @@ -682,7 +697,7 @@ def trace_and_on_node_processing(usr_argv) # Launch User Command begin - launch_usr_bin(h, usr_argv) + XprofExitCode.update(launch_usr_bin(h, usr_argv)) rescue Errno::ENOENT teardown_lttng(syncd) raise @@ -739,7 +754,7 @@ def gm_processing(folder) fo.close end - exit(1) unless $?.success? + $?.exitstatus end # @@ -860,8 +875,12 @@ if __FILE__ == $PROGRAM_NAME # Right now, `replay` means no tracing. # But we don't have a way of disabling post-processing folder = OPTIONS.include?(:replay) ? OPTIONS[:replay] || last_trace_saved : trace_and_on_node_processing(ARGV) + if mpi_master? warn("THAPI: Trace location: #{folder}") - gm_processing(folder) if OPTIONS[:analysis] + XprofExitCode.update(gm_processing(folder)) if OPTIONS[:analysis] end + + exit(XprofExitCode.get) + end From a8dfacd19ed05a1377a794af0b409a325bfe558e Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 11 Oct 2024 12:09:03 -0500 Subject: [PATCH 4/8] fix_ze_soname (#296) Co-authored-by: Thomas Applencourt --- xprof/xprof.rb.in | 2 +- ze/Makefile.am | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index e1a8e3a3..64b2017d 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -357,7 +357,7 @@ def env_tracers backends = [] [%w[opencl cl libOpenCL libTracerOpenCL], - %w[ze ze libze_loader libTracerZE], + %w[ze ze libze_loader libze_loader], %w[cuda cuda libcuda libTracerCUDA], %w[hip hip libamdhip64 libTracerHIP], %w[mpi mpi libmpi libTracerMPI]].each do |name, bt_name, lib, libtracer| diff --git a/ze/Makefile.am b/ze/Makefile.am index 20d14200..19ee789f 100644 --- a/ze/Makefile.am +++ b/ze/Makefile.am @@ -168,21 +168,22 @@ libzetracepoints_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/inclu libzetracepoints_la_CFLAGS = -fPIC -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Wno-sign-compare $(WERROR) $(LTTNG_UST_CFLAGS) libzetracepoints_la_LDFLAGS = $(LTTNG_UST_LIBS) -lib_LTLIBRARIES = libTracerZE.la libZEInterval.la +lib_LTLIBRARIES = libze_loader.la libZEInterval.la -nodist_libTracerZE_la_SOURCES = \ +nodist_libze_loader_la_SOURCES = \ $(ZE_PROBES_INCL) \ $(ZE_STATIC_PROBES_INCL) \ tracer_ze.c -libTracerZE_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I$(top_srcdir)/utils -I./ -libTracerZE_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) -libTracerZE_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la -libTracerZE_la_LIBADD = libzetracepoints.la +libze_loader_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I$(top_srcdir)/utils -I./ +libze_loader_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) +libze_loader_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la +libze_loader_la_LDFLAGS += -version-info 1:0:0 +libze_loader_la_LIBADD = libzetracepoints.la install-exec-hook: $(MKDIR_P) $(DESTDIR)$(pkglibdir)/ze - $(LN_S) -f $(DESTDIR)$(libdir)/libTracerZE.so.0.0.0 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1 + $(LN_S) -f $(DESTDIR)$(libdir)/libze_loader.so.1.0.0 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1 $(LN_S) -f $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so $(MKDIR_P) $(DESTDIR)$(pkglibdir)/bt2 $(LN) -f $(DESTDIR)$(libdir)/libZEInterval.so $(DESTDIR)$(pkglibdir)/bt2/libZEInterval.so From c5436f2bf343b875207d73bde8935bad8a010d31 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 11 Oct 2024 13:04:32 -0500 Subject: [PATCH 5/8] fix tracer ze (#297) Co-authored-by: Thomas Applencourt --- ze/tracer_ze.sh.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ze/tracer_ze.sh.in b/ze/tracer_ze.sh.in index 42263c7e..0185ac43 100644 --- a/ze/tracer_ze.sh.in +++ b/ze/tracer_ze.sh.in @@ -126,7 +126,7 @@ then else export LD_LIBRARY_PATH=$pkglibdir/ze:$LD_LIBRARY_PATH fi -export LD_PRELOAD=$libdir/libTracerZE.so:$LD_PRELOAD +export LD_PRELOAD=$libdir/libze_loader.so:$LD_PRELOAD export LTTNG_UST_ALLOW_BLOCKING=1 export LTTNG_UST_ZE_VERBOSE=1 lttng start From 9db274d65ba9f120524445da0f1bd4a5445b7be2 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 4 Sep 2024 13:22:14 -0500 Subject: [PATCH 6/8] Dependency update (#286) Use dependency from Efficios deliverable. --------- Co-authored-by: Thomas Applencourt --- .github/workflows/presubmit.yml | 203 ++++++++++++++++++-------------- ze/Makefile.am | 6 +- 2 files changed, 122 insertions(+), 87 deletions(-) diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 0901bb69..6ec61195 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -1,6 +1,9 @@ name: Presubmit on: [push, pull_request] +env: + APT_PACKAGE: gcc g++ ruby ruby-dev elfutils libelf-dev libpopt-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev libnuma-dev liburcu-dev + jobs: pre_job: runs-on: ubuntu-24.04 @@ -15,65 +18,93 @@ jobs: paths_ignore: '["**/README.md"]' do_not_skip: '["pull_request"]' - babeltrace2: + efficios_dep: needs: pre_job if: ${{ needs.pre_job.outputs.should_skip != 'true' }} - name: Build and cache Babeltrace2 + name: Build and Cache Efficios Dependencies runs-on: ubuntu-24.04 steps: - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler libglib2.0-dev - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: wget https://www.efficios.com/files/babeltrace/babeltrace2-2.0.5.tar.bz2 - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: tar -xjvf babeltrace2-2.0.5.tar.bz2 - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: | - wget https://github.com/argonne-lcf/THAPI-spack/raw/main/packages/babeltrace2/d2d2e6cc.patch - patch -p1 < d2d2e6cc.patch - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5 - - run: mkdir -p babeltrace2-2.0.5/build - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: ../configure --prefix=$HOME/babeltrace2/2.0.5 - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build - - run: make -j - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build - - run: make -j install - if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build + - name: Set PKG_CONFIG + run: | + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + if: steps.efficios_dep.outputs.cache-hit != 'true' + - run: sudo apt update; sudo apt install -y $APT_PACKAGE + if: steps.efficios_dep.outputs.cache-hit != 'true' + # lttng-ust + - run: git clone https://github.com/lttng/lttng-ust + if: steps.efficios_dep.outputs.cache-hit != 'true' + - name: Install lttng-ust + run: | + # Avoid https://github.com/lttng/lttng-ust/commit/b187bcd5d99cde54dececee0e5028524d55aa314 who change the signature of + # lttng_ust_ctl_recv_register_event used by lttng-tool anl-ms3 + git checkout 4f8afc535e77070f1ef00434674f0417c6f9ef69 + ./bootstrap + ./configure --disable-man-pages --prefix=$HOME/efficios_dep/ + make -j$(nproc) + make install + working-directory: lttng-ust + if: steps.efficios_dep.outputs.cache-hit != 'true' + # lttng-tools need lttng-ust 2.14+ + - run: git clone -b anl-ms3 git://git.efficios.com/deliverable/lttng-tools.git + if: steps.efficios_dep.outputs.cache-hit != 'true' + - name: Install lttng-tools + run: | + ./bootstrap + ./configure --disable-man-pages --disable-bin-lttng-crash --prefix=$HOME/efficios_dep + make -j$(nproc) + make install + #Todo, this will need to be added in the spack repo as a patch + echo "#!/usr/bin/env python"| cat - dirwatch.py > $HOME/efficios_dep/bin/dirwatch.py + chmod 755 $HOME/efficios_dep/bin/dirwatch.py + working-directory: lttng-tools + if: steps.efficios_dep.outputs.cache-hit != 'true' + # babeltrace + - run: git clone -b anl-ms3 git://git.efficios.com/deliverable/babeltrace.git + if: steps.efficios_dep.outputs.cache-hit != 'true' + - name: Install Babeltrace + run: | + #Todo, grab file from Spack + wget https://raw.githubusercontent.com/argonne-lcf/THAPI/53262fcaaaf45d7d475884d7e63b69abe47e41d6/.github/workflows/str_nullptr.patch + patch -p1 < str_nullptr.patch + wget https://raw.githubusercontent.com/argonne-lcf/THAPI/4418916620496fd66cde0b3d5e241bed0a4c18a3/.github/workflows/bt_makefile.patch + patch -p1 < bt_makefile.patch + ./bootstrap + ./configure --disable-man-pages --prefix=$HOME/efficios_dep + make -j$(nproc) + make install + working-directory: babeltrace + if: steps.efficios_dep.outputs.cache-hit != 'true' build-and-check: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Build and Check ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: ./autogen.sh - run: ../configure @@ -94,7 +125,7 @@ jobs: build/**/tests/*.log install-with-mpi: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Install with MPI daemon support runs-on: ubuntu-24.04 @@ -102,23 +133,23 @@ jobs: - uses: actions/checkout@v4 - uses: mpi4py/setup-mpi@v1 with: - mpi: intelmpi + mpi: intelmpi - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: ./autogen.sh - run: ../configure --prefix=`pwd`/ici @@ -133,7 +164,7 @@ jobs: path: thapi.tar integration-tests: - needs: [babeltrace2, pre_job, install-with-mpi] + needs: [efficios_dep, pre_job, install-with-mpi] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Integration ${{ matrix.bats_file }} ${{matrix.thapi_sync_daemon }} runs-on: ubuntu-24.04 @@ -146,50 +177,50 @@ jobs: with: mpi: intelmpi - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - uses: actions/download-artifact@v4 with: name: thapi-bin - name: Untar THAPI run: tar -xvf thapi.tar - - run: sudo apt update; sudo apt install -y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo - run: sudo gem install babeltrace2 opencl_ruby_ffi - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - name: Integration test run: | bats integration_tests/ - build-in-tree: - needs: [babeltrace2, pre_job] + build-in-tree-and-check: + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Build in Tree ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: ./autogen.sh - run: ./configure - run: make -j @@ -206,28 +237,28 @@ jobs: ./**/tests/*.log distcheck: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Distcheck ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: ./autogen.sh - run: ../configure @@ -238,28 +269,28 @@ jobs: THAPI_VALGRIND: 1 dist-check: - needs: [babeltrace2, pre_job] + needs: [efficios_dep, pre_job] if: ${{ needs.pre_job.outputs.should_skip != 'true' }} name: Dist and Check ubuntu-24.04 runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: actions/cache@v4 - id: babeltrace2 + id: efficios_dep env: - cache-name: cache-babeltrace2 + cache-name: cache-efficios_dep with: - path: ~/babeltrace2/2.0.5 + path: ~/efficios_dep/ key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | - echo "$HOME/babeltrace2/2.0.5/bin" >> $GITHUB_PATH - echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV - echo "CPATH=$HOME/babeltrace2/2.0.5/include:$CPATH" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "LIBRARY_PATH=$HOME/babeltrace2/2.0.5/lib:$LIBRARY_PATH" >> $GITHUB_ENV + echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH + echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV + echo "CPATH=$HOME/efficios_dep/include:$CPATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "LIBRARY_PATH=$HOME/efficios_dep/lib:$LIBRARY_PATH" >> $GITHUB_ENV - run: mkdir -p build - run: ./autogen.sh - run: ../configure diff --git a/ze/Makefile.am b/ze/Makefile.am index 19ee789f..60f88f30 100644 --- a/ze/Makefile.am +++ b/ze/Makefile.am @@ -1,7 +1,11 @@ .DELETE_ON_ERROR: if STRICT - WERROR = -Werror + # We disable `nonnull` check due to + # ././ze_tracepoints.h: In function 'lttng_ust__event_get_size__lttng_ust_ze___zeModuleCreate_entry': + # [...]/lttng/ust-tracepoint-event.h:578:17: error: argument 1 null where non-null expected [-Werror=nonnull] + # 578 | strlen((_src) ? (_src) : LTTNG_UST__NULL_STRING) + 1; + WERROR = -Werror -Wno-error=nonnull else WERROR = endif From 1653288a3dfd589db30b4e97f7e395546060d5dd Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 4 Sep 2024 15:36:13 -0500 Subject: [PATCH 7/8] Add archive (#287) Enable usage of session rotation for lossless online trace consumption. --------- Co-authored-by: Thomas Applencourt --- .github/workflows/presubmit.yml | 8 ++-- integration_tests/general.bats | 4 ++ utils/babeltrace_thapi.in | 9 ++++ xprof/xprof.rb.in | 75 ++++++++++++++++++++++++++++----- 4 files changed, 81 insertions(+), 15 deletions(-) diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 6ec61195..58eaa44b 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -190,7 +190,7 @@ jobs: run: tar -xvf thapi.tar - run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo - run: sudo gem install babeltrace2 opencl_ruby_ffi - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV @@ -214,7 +214,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV @@ -252,7 +252,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV @@ -284,7 +284,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencie run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV diff --git a/integration_tests/general.bats b/integration_tests/general.bats index b4784770..e3aeed4a 100644 --- a/integration_tests/general.bats +++ b/integration_tests/general.bats @@ -22,6 +22,10 @@ teardown_file() { rm out.pftrace } +@test "archive_summary" { + $IPROF --archive $THAPI_TEST_BIN +} + @test "replay_summary" { $IPROF $THAPI_TEST_BIN $IPROF -r diff --git a/utils/babeltrace_thapi.in b/utils/babeltrace_thapi.in index fe344861..08125a0a 100755 --- a/utils/babeltrace_thapi.in +++ b/utils/babeltrace_thapi.in @@ -130,6 +130,7 @@ def get_components(names) components_classes = { 'source.ctf.fs' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('fs'), 'source.ctf.lttng_live' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-live'), + 'source.ctf.lttng_archive' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-archive'), 'filter.utils.muxer' => BT2::BTPlugin.find('utils').get_filter_component_class_by_name('muxer'), 'sink.text.pretty' => BT2::BTPlugin.find('text').get_sink_component_class_by_name('pretty'), 'sink.ctf.fs' => BT2::BTPlugin.find('ctf').get_sink_component_class_by_name('fs'), @@ -200,6 +201,10 @@ def get_and_add_components(graph, names, l_inputs) graph.add(comp, 'source_live', params: { 'inputs' => $options[:inputs], 'session-not-found-action' => 'end' }) + when 'source.ctf.lttng_archive' + graph.add(comp, 'source_archive', + params: { 'session-name' => $options[:archive], + 'session-found-file-path' => $options[:'archive-session-found-file-path'] }) when 'source.ctf.fs' s = Find.find(*l_inputs) .reject { |path| FileTest.directory?(path) } @@ -281,6 +286,8 @@ def bt_graphs(inputs) @bt_graphs[inputs] ||= begin g_comps = [if $options[:live] 'source.ctf.lttng_live' + elsif $options[:archive] + 'source.ctf.lttng_archive' else 'source.ctf.fs' end] @@ -354,6 +361,8 @@ class BabeltraceParserThapi < OptionParserWithDefaultAndValidation 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) on('--debug', default: false) + on('--archive SESSION-NAME') + on('--archive-session-found-file-path PATH') on('--[no-]muxer') on('-v', '--version', 'Print the version string') do puts File.read(File.join(DATADIR, 'version')) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 64b2017d..926f9e3c 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -13,6 +13,11 @@ PREFIX = '@prefix@' DATAROOTDIR = File.join(PREFIX, 'share') DATADIR = DATAROOTDIR +LTTNG_ARCHIVE_SIZE = '50M' +LTTNG_ARCHIVE_TIMER = '60s' +LTTNG_DIRWATCH_SIZE = '500' # In MiB +LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1 + class XprofExitCode @@exit_code = 0 def self.update(exit_code) @@ -584,12 +589,18 @@ def lm_setup_lttng(backends) end end + # This is required to force the creation of an trace, + # so that dirwatch doesn't complain about empty trace + if OPTIONS[:archive] + exec("lttng enable-rotation --session=#{lttng_session_uuid} --size=#{LTTNG_ARCHIVE_SIZE} --timer=#{LTTNG_ARCHIVE_TIMER}") + end exec("lttng start #{lttng_session_uuid}") end def lm_lttng_teardown_session raise unless mpi_local_master? + exec("lttng rotate #{lttng_session_uuid}") if OPTIONS[:archive] exec("lttng destroy #{lttng_session_uuid}") end @@ -622,7 +633,27 @@ def lm_babeltrace(backends) opts << "--output #{thapi_trace_dir_tmp}" opts << "--backends #{backends.join(',')}" opts << '--no-discard-metadata' if type == 'aggreg' && OPTIONS.include?(:'kernel-verbose') - exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + + if OPTIONS[:archive] + read_file = File.join(lttng_trace_dir_tmp, 'bt_archive_ready') + opts << "--archive #{lttng_session_uuid} --archive-session-found-file-path=#{read_file}" + cmd = "#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}" + LOGGER.debug(cmd) + pid_bt = spawn(cmd) + + cmd = "dirwatch.py --log-level=CRITICAL #{lttng_session_uuid} #{LTTNG_DIRWATCH_SIZE}" + LOGGER.debug(cmd) + pid_dirwatch = spawn(cmd) + + until File.exist?(read_file) + # Ensure that dirwatch.py didn't crash, and deadlock + Process.wait(pid_dirwatch, Process::WNOHANG) + sleep(LTTNG_DIRWATCH_LOCK_RETRY_DELAY) + end + [pid_bt, pid_dirwatch] + else + exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + end end # _ @@ -630,7 +661,7 @@ end # | | (_) (_ (/_ _> _> | | | (_| # _| -# Some naming convension +# Some naming convention # lm == function executed only local_master # gm == function executed only global_master @@ -640,6 +671,11 @@ def lm_move_to_shared if OPTIONS.include?(:trace) || !OPTIONS[:analysis] # The Apps finished, lttng finished, need to move to the shared tmp folder FileUtils.mkdir_p(File.dirname(thapi_trace_dir_tmp)) + # NOTE: I don't understand `mv` + # File.mv(a, b) will put a into b (aka a/b) + # FileUtils.rename(a,b) will move a as b, but may + # raise Invalid cross-device error. + # So we use `exec(mv -T a b)`, this have the added benefice of logging exec("mv #{lttng_trace_dir_tmp} #{thapi_trace_dir_tmp}") else # `lm_babeltrace` finished, can remove `tmp` folder @@ -655,7 +691,7 @@ def gm_rename_folder # Replace it with a better name, and update the root metadata. thapi_trace_dir_tmp_root = File.dirname(thapi_trace_dir_tmp) - # Because of `traced-rank`, `mpi_master` may not have any trace avalaible, + # Because of `traced-rank`, `mpi_master` may not have any trace available, # so find the first hostname who have a metadata FileUtils.cp(Dir.glob("#{thapi_trace_dir_tmp_root}/*/thapi_metadata.yaml").first, File.join(thapi_trace_dir_tmp_root, 'thapi_metadata.yaml')) @@ -671,15 +707,25 @@ end # Start, Stop lttng, amd do the on-node analsysis def trace_and_on_node_processing(usr_argv) - def teardown_lttng(syncd) + def teardown_lttng(syncd, pids) # We need to be sure that all the local ranks are finished - # before the local master stops the lttng session syncd.local_barrier('waiting_for_application_ending') + + # Everything from now on, is some local-master processing + # The `Sync_daemon` context will handle the call to the global barrier + # for the early exiting ranks return unless mpi_local_master? - # Stop Lttng session + # Stop Lttng session and babeltrace daemons lm_lttng_teardown_session - # Lttng session is finished, + if OPTIONS[:archive] + LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish") + pids.each do |pid| + Process.wait(pid) + XprofExitCode.update($?.exitstatus) + Logger.warn("#{pid} failed") unless $?.success? + end + end # we can kill the session daemon lm_lttng_kill_sessiond end @@ -692,24 +738,30 @@ def trace_and_on_node_processing(usr_argv) # so they can have access to the daemon ENV['LTTNG_HOME'] = lttng_home_dir # Only local master spawn LTTNG daemon and start session - lm_setup_lttng(backends) if mpi_local_master? + pids = if mpi_local_master? + lm_setup_lttng(backends) + lm_babeltrace(backends) if OPTIONS[:archive] + end + syncd.local_barrier('waiting_for_lttng_setup') # Launch User Command begin XprofExitCode.update(launch_usr_bin(h, usr_argv)) rescue Errno::ENOENT - teardown_lttng(syncd) + teardown_lttng(syncd, pids) raise end - teardown_lttng(syncd) + teardown_lttng(syncd, pids) return unless mpi_local_master? # Preprocess trace - lm_babeltrace(backends) + lm_babeltrace(backends) unless OPTIONS[:archive] lm_move_to_shared end + # Global master rename the unique trace folder to a more + # human friendly name gm_rename_folder if mpi_master? end @@ -804,6 +856,7 @@ if __FILE__ == $PROGRAM_NAME parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.", 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) + parser.on('--[no-]archive', 'Trigger for ardhive support', default: false) # Analysis parser.on('-r', '--replay [PATH]', 'Replay traces for post-mortem analysis.', From b970fb3f9f795fecd3750ba149cf078f735187d0 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 17 Sep 2024 14:55:57 -0500 Subject: [PATCH 8/8] Single rank profiling (#288) * Make only local master do energy profiling. * Use ZES to query devices in order to get around affinity masks. * Use ZES for drivers as well. * set ZES * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update xprof/xprof.rb.in --------- Co-authored-by: Brice Videau Co-authored-by: Thomas Applencourt --- xprof/xprof.rb.in | 10 +++++--- ze/tracer_ze_helpers.include.c | 46 ++++++++++++++++------------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 926f9e3c..0ca9e03f 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -345,8 +345,7 @@ end def sampling? return false unless OPTIONS[:sample] - - env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '0') == '0' || mpi_local_master? + env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '1') == '0' || mpi_local_master? end def env_tracers @@ -398,11 +397,16 @@ def env_tracers end # Sample + # Currently the same `so` does the tracing, and the sampling + # This mean that is the local rank is not part of the `traced-ranks` + # No sampling will be performed if sampling? LOGGER.debug('Sampling Enabled') h['LTTNG_UST_SAMPLING'] = 1 h['LTTNG_UST_SAMPLING_ENERGY'] = 1 - h['ZES_ENABLE_SYSMAN'] = 1 if OPTIONS[:'backend-names'].include?('ze') + # The current only reliable way to use zes api + # is to call zesInit and set ZES_ENABLE_SYSMAN to 0 + h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze') end backends = [] unless need_backend diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 292e993f..164149a9 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -793,8 +793,8 @@ static int _sampling_freq_initialized = 0; static int _sampling_pwr_initialized = 0; static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution -static ze_driver_handle_t* _sampling_hDrivers = NULL; -static ze_device_handle_t** _sampling_hDevices = NULL; +static zes_driver_handle_t* _sampling_hDrivers = NULL; +static zes_device_handle_t** _sampling_hDevices = NULL; static zes_freq_handle_t*** _sampling_hFrequencies = NULL; static zes_pwr_handle_t*** _sampling_hPowers = NULL; static zes_engine_handle_t*** _sampling_engineHandles = NULL; @@ -909,58 +909,54 @@ static void intializeEngines() { static int initializeHandles() { ze_result_t res; - const char *e = getenv("ZES_ENABLE_SYSMAN"); - if (!(e && e[0] == '1')) { - fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n"); - return -1; - } -#ifdef CALL_ZEINIT - res = zeInit(ZE_INIT_FLAG_GPU_ONLY); + res = ZES_INIT_PTR(0); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("zeInit", res); + _ZE_ERROR_MSG("ZES_INIT_PTR", res); return -1; } -#endif // Query driver _sampling_driverCount = 0; - res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, NULL); + res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, NULL); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("1st ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DRIVER_GET_PTR", res); return -1; } - _sampling_hDrivers = (ze_driver_handle_t*) calloc(_sampling_driverCount, sizeof(ze_driver_handle_t)); - res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); + _sampling_hDrivers = (zes_driver_handle_t*) calloc(_sampling_driverCount, sizeof(zes_driver_handle_t)); + res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DRIVER_GET_PTR", res); return -1; } _sampling_deviceCount = (uint32_t*) calloc(_sampling_driverCount, sizeof(uint32_t)); _sampling_subDeviceCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); - _sampling_hDevices = (ze_device_handle_t**) calloc(_sampling_driverCount, sizeof(ze_device_handle_t*)); + _sampling_hDevices = (zes_device_handle_t**) calloc(_sampling_driverCount, sizeof(zes_device_handle_t*)); // Query device count for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) { fprintf(stderr, "ERROR: No device found!\n"); - _ZE_ERROR_MSG("1st ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res); return -1; } - _sampling_hDevices[driverIdx] = (ze_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t)); - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); + _sampling_hDevices[driverIdx] = (zes_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_device_handle_t)); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res); free(_sampling_hDevices[driverIdx]); return -1; } //Get no sub-devices _sampling_subDeviceCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { - res = ZE_DEVICE_GET_SUB_DEVICES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL); + zes_device_properties_t deviceProperties = {0}; + deviceProperties.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; + res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProperties); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res); + _ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res); _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; - } + } else + _sampling_subDeviceCount[driverIdx][deviceIdx] = deviceProperties.numSubdevices; if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) { _sampling_subDeviceCount[driverIdx][deviceIdx] = 1; }