Skip to content

Commit

Permalink
Better error message when bin doesn't exist (#291)
Browse files Browse the repository at this point in the history
* better error message when bin doesn't exist

* fix daemon not run

---------

Co-authored-by: Thomas Applencourt <[email protected]>
  • Loading branch information
TApplencourt and Thomas Applencourt authored Oct 1, 2024
1 parent 0b50b1a commit 1d23af6
Showing 1 changed file with 34 additions and 17 deletions.
51 changes: 34 additions & 17 deletions xprof/xprof.rb.in
Original file line number Diff line number Diff line change
Expand Up @@ -307,12 +307,14 @@ class Sync_daemon
# we always call clean-up the daemon
def self.open
yield f = new
rescue StandardError
raise
rescue Errno::ENOENT
exit(1)
ensure
return unless f
f.global_barrier
f.finalize
# https://www.rubydoc.info/gems/rubocop/RuboCop/Cop/Lint/EnsureReturn
if f
f.global_barrier
f.finalize
end
end
end

Expand Down Expand Up @@ -346,8 +348,7 @@ def env_tracers
%w[ze ze libze_loader libTracerZE],
%w[cuda cuda libcuda libTracerCUDA],
%w[hip hip libamdhip64 libTracerHIP],
%w[mpi mpi libmpi libTracerMPI],
].each do |name, bt_name, lib, libtracer|
%w[mpi mpi libmpi libTracerMPI]].each do |name, bt_name, lib, libtracer|
# Backend requested, skip omp. It will be handled in a custom case below
next unless OPTIONS[:'backend-names'].include?(bt_name)

Expand Down Expand Up @@ -430,6 +431,9 @@ def launch_usr_bin(env, cmd)
LOGGER.warn { 'Application Exited' }
rescue Interrupt
LOGGER.warn { 'Application Received Interrupt Signal' }
rescue Errno::ENOENT
warn("#{__FILE__}: Can't find executable #{cmd.first}")
raise Errno::ENOENT
end
end

Expand Down Expand Up @@ -570,11 +574,13 @@ end

# Destroy the lttng session identified by lttng_session_uuid.
#
# Must only be called when mpi_local_master? is true; otherwise a bare
# `raise` is issued (no custom message).
# NOTE(review): `exec` is presumably a project helper that runs the command
# and returns, not Kernel#exec -- confirm against its definition.
def lm_lttng_teardown_session
  raise unless mpi_local_master?

  destroy_cmd = "lttng destroy #{lttng_session_uuid}"
  exec(destroy_cmd)
end

def lm_lttng_kill_sessiond
raise unless mpi_local_master?

# Need to kill the sessiond Daemon. It's safe because each job has their own
#
# In theory, opening the lttng-sessiond.pid file is racy.
Expand Down Expand Up @@ -650,7 +656,19 @@ end

# Start, stop lttng, and do the on-node analysis
def trace_and_on_node_processing(usr_argv)
# Global barrier at exit
# Synchronize the local ranks, then let the local master shut lttng down.
#
# Every caller first waits on the 'waiting_for_application_ending' local
# barrier of +syncd+. Callers for which mpi_local_master? is false stop there
# and get nil; the local master goes on to tear down the lttng session and
# then kill the session daemon.
def teardown_lttng(syncd)
  # All local ranks must be finished before the local master stops the session
  syncd.local_barrier('waiting_for_application_ending')

  if mpi_local_master?
    # Stop the session first; once it is finished the session daemon can be killed
    lm_lttng_teardown_session
    lm_lttng_kill_sessiond
  end
end

Sync_daemon.open do |syncd|
# Load Tracers and APILoaders Lib
backends, h = env_tracers
Expand All @@ -661,19 +679,18 @@ def trace_and_on_node_processing(usr_argv)
# Only local master spawn LTTNG daemon and start session
lm_setup_lttng(backends) if mpi_local_master?
syncd.local_barrier('waiting_for_lttng_setup')

# Launch User Command
launch_usr_bin(h, usr_argv)
begin
launch_usr_bin(h, usr_argv)
rescue Errno::ENOENT
teardown_lttng(syncd)
raise
end

# We need to be sure that all the local ranks are finished
# before the local master stops the lttng session
syncd.local_barrier('waiting_for_application_ending')
teardown_lttng(syncd)
return unless mpi_local_master?

# Stop Lttng session
lm_lttng_teardown_session
# Lttng session is finished,
# we can kill the session daemon
lm_lttng_kill_sessiond
# Preprocess trace
lm_babeltrace(backends)
lm_move_to_shared
Expand Down

0 comments on commit 1d23af6

Please sign in to comment.