Skip to content

Commit

Permalink
Merge branch 'devel' into fabric_memory_stats
Browse files Browse the repository at this point in the history
  • Loading branch information
sbekele81 authored Oct 18, 2024
2 parents d549063 + b970fb3 commit 720ccd0
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 59 deletions.
8 changes: 8 additions & 0 deletions integration_tests/general.bats
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,11 @@ teardown_file() {
[ "$status" != 0 ]
rm out.pftrace
}

# Check that iprof exits with the same status as the command it wraps.
@test "exit_code_propagated" {
# Default invocation: the wrapped command exits with 55, and so must iprof.
run $IPROF -- bash -c "exit 55"
[ "$status" == 55 ]

# Same expectation when iprof is invoked with --no-analysis.
run $IPROF --no-analysis -- bash -c "exit 55"
[ "$status" == 55 ]
}
14 changes: 7 additions & 7 deletions xprof/sync_daemon_fs
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,19 @@ global_handle = nil
parent_pid = nil

# Set trap
Signal.trap(Sync_daemon::RT_SIGNAL_GLOBAL_BARRIER) do
Signal.trap(SyncDaemon::RT_SIGNAL_GLOBAL_BARRIER) do
global_barrier(global_handle)
Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid)
Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid)
end

local_barier_count = 0
Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do
Signal.trap(SyncDaemon::RT_SIGNAL_LOCAL_BARRIER) do
local_barier(local_barier_count.to_s)
local_barier_count += 1
Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid)
Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid)
end

Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do
Signal.trap(SyncDaemon::RT_SIGNAL_FINISH) do
# We cannot delete SHARED_LOCAL_FILESYSTEM
# Some ranks can exit the `global_barrier` (hence calling this function)
# while other ranks are still in the `local_barrier`
Expand All @@ -83,12 +83,12 @@ Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do
# is to make all ranks busy_wait in the `global_barrier`.
# This will ensure that every-one exited the `local_barrier`.
# but given the poor performance of our FS, we will avoid that for now...
Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid)
Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid)
exit
end

# Init global barrier
global_handle = init_global_barrier
parent_pid = ARGV[0].to_i
Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid)
Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid)
sleep
116 changes: 73 additions & 43 deletions xprof/xprof.rb.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ LTTNG_ARCHIVE_TIMER = '60s'
LTTNG_DIRWATCH_SIZE = '500' # In MiB
LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1

# Process-wide holder for the exit status xprof finishes with.
#
# Statuses are reported through `update`; only the first failure is retained,
# so later reports never mask the original error. `get` returns the retained
# status (0 when nothing failed).
class XprofExitCode
  # Class-level instance variable (not `@@exit_code`) so the state belongs to
  # this class only and is not shared through an inheritance tree.
  @exit_code = 0

  # Record +exit_code+ unless a failure has already been recorded.
  #
  # @param exit_code [Integer, nil] status to record. `nil` is what
  #   `Process::Status#exitstatus` yields when the process did not exit
  #   normally (e.g. it was killed by a signal); it is recorded as the generic
  #   failure status 1 so that `exit(XprofExitCode.get)` never receives `nil`,
  #   which would raise a TypeError.
  def self.update(exit_code)
    exit_code = 1 if exit_code.nil?
    # Keep only the first error
    @exit_code = exit_code if @exit_code == 0
  end

  # @return [Integer] the first failure status recorded, or 0 if none was.
  def self.get
    @exit_code
  end
end

$LOAD_PATH.unshift(DATADIR) if File.directory?(DATADIR)
require 'open3'
require 'fileutils'
Expand Down Expand Up @@ -238,7 +250,7 @@ end
# |_) _. ._ ._ o _ ._
# |_) (_| | | | (/_ |
#
class Sync_daemon
class SyncDaemon
SIGRTMIN = 34
RT_SIGNAL_READY = SIGRTMIN
RT_SIGNAL_GLOBAL_BARRIER = SIGRTMIN + 1
Expand Down Expand Up @@ -285,13 +297,13 @@ class Sync_daemon
end

LOGGER.debug { "spawn(#{daemon} #{Process.pid})" }
lazy_exec("Initialize Sync_daemon #{daemon_type}") do
lazy_exec("Initialize SyncDaemon #{daemon_type}") do
@pid = spawn("#{daemon} #{Process.pid}")
end
end

def finalize
lazy_exec('Finalize Sync_daemon') do
lazy_exec('Finalize SyncDaemon') do
`kill -#{RT_SIGNAL_FINISH} #{@pid}`
end
end
Expand All @@ -311,14 +323,15 @@ class Sync_daemon
# Context manager, ensure that when the block yield is exited
# we always call clean-up the daemon
def self.open
yield f = new
rescue StandardError
raise
yield syncd = new
rescue Errno::ENOENT
exit(1)
ensure
return unless f

f.global_barrier
f.finalize
# https://www.rubydoc.info/gems/rubocop/RuboCop/Cop/Lint/EnsureReturn
if syncd
syncd.global_barrier
syncd.finalize
end
end
end

Expand Down Expand Up @@ -348,11 +361,10 @@ def env_tracers
backends = []

[%w[opencl cl libOpenCL libTracerOpenCL],
%w[ze ze libze_loader libTracerZE],
%w[ze ze libze_loader libze_loader],
%w[cuda cuda libcuda libTracerCUDA],
%w[hip hip libamdhip64 libTracerHIP],
%w[mpi mpi libmpi libTracerMPI],
].each do |name, bt_name, lib, libtracer|
%w[mpi mpi libmpi libTracerMPI]].each do |name, bt_name, lib, libtracer|
# Backend requested, skip omp. It will be handled in a custom case below
next unless OPTIONS[:'backend-names'].include?(bt_name)

Expand Down Expand Up @@ -393,7 +405,7 @@ def env_tracers
h['LTTNG_UST_SAMPLING'] = 1
h['LTTNG_UST_SAMPLING_ENERGY'] = 1
# The current only reliable way to use zes api
# is to call zesInit and set ZES_ENABLE_SYSMAN to 0
# is to call zesInit and set ZES_ENABLE_SYSMAN to 0
h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze')
end

Expand Down Expand Up @@ -432,14 +444,20 @@ def launch_usr_bin(env, cmd)

begin
PTY.spawn(bash_env, *cmd) do |stdout, _stdin, _pid|
# Reading stdout will trigger Errno::EIO
stdout.each { |line| print line }
rescue Errno::EIO
# Wait for the PTY to finish, to set $?
Process.wait(_pid)
return $?.exitstatus
end
# Not sure how this exception can be triggered
rescue PTY::ChildExited
LOGGER.warn { 'Application Exited' }
rescue Interrupt
LOGGER.warn { 'Application Received Interrupt Signal' }
# SigINT is 2
2
rescue Errno::ENOENT
warn("#{__FILE__}: Can't find executable #{cmd.first}")
raise Errno::ENOENT
end
end

Expand Down Expand Up @@ -693,29 +711,8 @@ end

# Start and stop lttng, and do the on-node analysis
def trace_and_on_node_processing(usr_argv)
# Global barrier at exit
Sync_daemon.open do |syncd|
# Load Tracers and APILoaders Lib
backends, h = env_tracers

# All ranks need to set the LTTNG_HOME env
# so they can have access to the daemon
ENV['LTTNG_HOME'] = lttng_home_dir
LOGGER.debug("LTTNG_HOME = #{ENV.fetch('LTTNG_HOME', nil)}")

# Only local master spawn daemons (lttng, and babeltrace)
# and the start the lttng-session
pids = if mpi_local_master?
lm_setup_lttng(backends)
lm_babeltrace(backends) if OPTIONS[:archive]
end
# Other local node cannot start before lttng and the daemon
syncd.local_barrier('waiting_for_lttng_setup')
# Launch User Command
launch_usr_bin(h, usr_argv)
# We need to ensure that all the local ranks have finished
# running the user application
# before the local master stops the lttng session
def teardown_lttng(syncd, pids)
# We need to be sure that all the local ranks are finished
syncd.local_barrier('waiting_for_application_ending')

# Everything from now on, is some local-master processing
Expand All @@ -729,11 +726,40 @@ def trace_and_on_node_processing(usr_argv)
LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish")
pids.each do |pid|
Process.wait(pid)
raise "#{pid} failed" unless $?.success?
# Fold the status of the child just reaped by Process.wait into the global exit code.
XprofExitCode.update($?.exitstatus)
# Log through the LOGGER instance used elsewhere in this file: `Logger` is the
# stdlib class and has no `warn` class method, so calling it would raise
# NoMethodError exactly when a child failed.
LOGGER.warn { "#{pid} failed" } unless $?.success?
end
end
# we can kill the session daemon
lm_lttng_kill_sessiond
end

SyncDaemon.open do |syncd|
# Load Tracers and APILoaders Lib
backends, h = env_tracers

# All ranks need to set the LTTNG_HOME env
# so they can have access to the daemon
ENV['LTTNG_HOME'] = lttng_home_dir
# Only local master spawn LTTNG daemon and start session
pids = if mpi_local_master?
lm_setup_lttng(backends)
lm_babeltrace(backends) if OPTIONS[:archive]
end

syncd.local_barrier('waiting_for_lttng_setup')

# Launch User Command
begin
XprofExitCode.update(launch_usr_bin(h, usr_argv))
rescue Errno::ENOENT
teardown_lttng(syncd, pids)
raise
end

teardown_lttng(syncd, pids)
return unless mpi_local_master?

# Preprocess trace
lm_babeltrace(backends) unless OPTIONS[:archive]
lm_move_to_shared
Expand Down Expand Up @@ -784,7 +810,7 @@ def gm_processing(folder)

fo.close
end
exit(1) unless $?.success?
$?.exitstatus
end

#
Expand Down Expand Up @@ -906,8 +932,12 @@ if __FILE__ == $PROGRAM_NAME
# Right now, `replay` means no tracing.
# But we don't have a way of disabling post-processing
folder = OPTIONS.include?(:replay) ? OPTIONS[:replay] || last_trace_saved : trace_and_on_node_processing(ARGV)

if mpi_master?
warn("THAPI: Trace location: #{folder}")
gm_processing(folder) if OPTIONS[:analysis]
XprofExitCode.update(gm_processing(folder)) if OPTIONS[:analysis]
end

exit(XprofExitCode.get)

end
15 changes: 8 additions & 7 deletions ze/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -172,21 +172,22 @@ libzetracepoints_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/inclu
libzetracepoints_la_CFLAGS = -fPIC -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Wno-sign-compare $(WERROR) $(LTTNG_UST_CFLAGS)
libzetracepoints_la_LDFLAGS = $(LTTNG_UST_LIBS)

lib_LTLIBRARIES = libTracerZE.la libZEInterval.la
lib_LTLIBRARIES = libze_loader.la libZEInterval.la

nodist_libTracerZE_la_SOURCES = \
nodist_libze_loader_la_SOURCES = \
$(ZE_PROBES_INCL) \
$(ZE_STATIC_PROBES_INCL) \
tracer_ze.c

libTracerZE_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I$(top_srcdir)/utils -I./
libTracerZE_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS)
libTracerZE_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la
libTracerZE_la_LIBADD = libzetracepoints.la
libze_loader_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I$(top_srcdir)/utils -I./
libze_loader_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS)
libze_loader_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la
libze_loader_la_LDFLAGS += -version-info 1:0:0
libze_loader_la_LIBADD = libzetracepoints.la

install-exec-hook:
$(MKDIR_P) $(DESTDIR)$(pkglibdir)/ze
$(LN_S) -f $(DESTDIR)$(libdir)/libTracerZE.so.0.0.0 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1
$(LN_S) -f $(DESTDIR)$(libdir)/libze_loader.so.1.0.0 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1
$(LN_S) -f $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so
$(MKDIR_P) $(DESTDIR)$(pkglibdir)/bt2
$(LN) -f $(DESTDIR)$(libdir)/libZEInterval.so $(DESTDIR)$(pkglibdir)/bt2/libZEInterval.so
Expand Down
2 changes: 1 addition & 1 deletion ze/tracer_ze.sh.in
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ then
else
export LD_LIBRARY_PATH=$pkglibdir/ze:$LD_LIBRARY_PATH
fi
export LD_PRELOAD=$libdir/libTracerZE.so:$LD_PRELOAD
export LD_PRELOAD=$libdir/libze_loader.so:$LD_PRELOAD
export LTTNG_UST_ALLOW_BLOCKING=1
export LTTNG_UST_ZE_VERBOSE=1
lttng start
Expand Down
3 changes: 2 additions & 1 deletion ze/tracer_ze_helpers.include.c
Original file line number Diff line number Diff line change
Expand Up @@ -1112,6 +1112,7 @@ static int initializeHandles() {
_sampling_hSubDevices[driverIdx] = (ze_device_handle_t **)calloc(
_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t *));
for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) {

zes_device_properties_t deviceProps = {0};
deviceProps.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
deviceProps.pNext = NULL;
Expand All @@ -1127,7 +1128,7 @@ static int initializeHandles() {
(ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx],
&_sampling_subDeviceCount[driverIdx][deviceIdx], NULL);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res);
_ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res);
_sampling_subDeviceCount[driverIdx][deviceIdx] = 0;
}
if (_sampling_subDeviceCount[driverIdx][deviceIdx] > 0) {
Expand Down

0 comments on commit 720ccd0

Please sign in to comment.