From 0f6a74790265e39d468bdc3178a5964ab9c33039 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 21:09:35 +0000 Subject: [PATCH 1/8] fix fs --- integration_tests/light_iprof_only_sync.sh | 14 +++++++------- xprof/sync_daemon_fs | 15 +++++++++++---- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/integration_tests/light_iprof_only_sync.sh b/integration_tests/light_iprof_only_sync.sh index f6ba6fe9..5c7a4623 100755 --- a/integration_tests/light_iprof_only_sync.sh +++ b/integration_tests/light_iprof_only_sync.sh @@ -14,7 +14,7 @@ RT_SIGNAL_FINISH=$((SIGRTMIN + 3)) SIGNAL_RECEIVED="false" # Signal handler for capturing signals handle_signal() { - echo "--Received signal $1 from mpi_daemon" + echo "$PARENT_PID | Received signal $1 from mpi_daemon" if [ "$1" == "RT_SIGNAL_READY" ]; then SIGNAL_RECEIVED="true" fi @@ -42,19 +42,19 @@ PARENT_PID=$$ # Start sync daemon in the background ${THAPI_BIN_DIR}/sync_daemon_${THAPI_SYNC_DAEMON} $PARENT_PID & DAEMON_PID=$! -echo "Wait for daemon to be ready" +echo "$PARENT_PID | Wait for daemon to be ready" wait_for_signal -echo "Send Local Barrier signal" +echo "$PARENT_PID | Send Local Barrier signal" send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER # Run test program "$@" # Final synchronization after mpi_hello_world execution -echo "Send Local Barrier signal" +echo "$PARENT_PID | Send Local Barrier signal" send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER -echo "Send Global Barrier signal" +echo "$PARENT_PID | Send Global Barrier signal" send_signal_blocking $RT_SIGNAL_GLOBAL_BARRIER -echo "Send Termination signal" +echo "$PARENT_PID | Send Termination signal" send_signal_blocking $RT_SIGNAL_FINISH -echo "Wait for daemon to quit" +echo "$PARENT_PID | Wait for daemon to quit" wait $DAEMON_PID diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index affe14d3..3b87a319 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -74,10 +74,17 @@ Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do end Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do - FileUtils.rm_rf(SHARED_GLOBAL_FILESYSTEM) if mpi_master? - FileUtils.rm_rf(SHARED_LOCAL_FILESYSTEM) if mpi_local_master? - Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) - exit + # Rmdir will raise if the directory not empty + # Some process may still be in the `busy_wait` of the barrier + # when other finished. + # If we remove the folder, it will deadlock + begin + Dir.rmdir(SHARED_LOCAL_FILESYSTEM) if mpi_local_master? + Dir.rmdir(SHARED_GLOBAL_FILESYSTEM) if mpi_master? + rescue + sleep(1) + retry + end end # Init global barrier From 8db2a5fd0ddaec33ec693c4924bd208a21bf44cc Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 21:35:40 +0000 Subject: [PATCH 2/8] fis fs? --- xprof/sync_daemon_fs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index 3b87a319..c0782f8b 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -74,17 +74,16 @@ Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do end Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do - # Rmdir will raise if the directory not empty - # Some process may still be in the `busy_wait` of the barrier - # when other finished. - # If we remove the folder, it will deadlock - begin - Dir.rmdir(SHARED_LOCAL_FILESYSTEM) if mpi_local_master? - Dir.rmdir(SHARED_GLOBAL_FILESYSTEM) if mpi_master? - rescue - sleep(1) - retry - end + # We cannot delete SHARED_LOCAL_FILESYSTEM + # As `local_barrier` count up to a number of file + # So some ranks maybe `sleeping` waiting for the correct number of file. + # If we remove `SHARED_LOCAL_FILESYSTEM`, it will deadlock + + # We can always `SHARED_lOCAL_FILESYSTEM`, as it's count down to 0. + # Check `global_barrier` + Dir.rmdir(SHARED_GLOBAL_FILESYSTEM) if mpi_master? + Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) + exit end # Init global barrier From d114a0154c0013d1c4f4599e12e343a02eeb9122 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 21:52:45 +0000 Subject: [PATCH 3/8] fix daemon --- xprof/sync_daemon_fs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index c0782f8b..3946081c 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -75,13 +75,12 @@ end Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do # We cannot delete SHARED_LOCAL_FILESYSTEM - # As `local_barrier` count up to a number of file - # So some ranks maybe `sleeping` waiting for the correct number of file. - # If we remove `SHARED_LOCAL_FILESYSTEM`, it will deadlock - - # We can always `SHARED_lOCAL_FILESYSTEM`, as it's count down to 0. - # Check `global_barrier` - Dir.rmdir(SHARED_GLOBAL_FILESYSTEM) if mpi_master? + # As even some rank can exit the `global_barier` + # when some other are still in the `local_barrier` + # One possibility to be abble to remove `SHARED_LOCAL_FILESYSTEM`, + # if to make all rank busy_wait in the `global_barrier`. + # This will ensure that everoone exited the `local_barrier`. + # but given the poor performance of our FS, we will avoid that for now... Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) exit end From e72ea6c983e76c8e20cb35a3d980d0a27703847e Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 22:02:04 +0000 Subject: [PATCH 4/8] better comment --- integration_tests/parallel_execution.bats | 4 +++- xprof/sync_daemon_fs | 12 +++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index 085128eb..d6df6626 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -10,6 +10,9 @@ teardown_file() { rm -rf $THAPI_HOME/thapi-traces } +# To avoid race condition, the HAPI_SYNC_DAEMON=f tests need to be run: +# - on one node, +# - or tracing a MPI appplication @test "sync_daemon_fs" { THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } @@ -23,7 +26,6 @@ teardown_file() { THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld } - @test "sync_daemon_mpi" { THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index 3946081c..93533dac 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -75,12 +75,14 @@ end Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do # We cannot delete SHARED_LOCAL_FILESYSTEM - # As even some rank can exit the `global_barier` - # when some other are still in the `local_barrier` + # Some rank can exit the `global_barier`, as hence call this function + # when others rake are still in the `local_barrier` + # If we delete SHARED_LOCAL_FILESYSTEM, it will deadlock + # # One possibility to be abble to remove `SHARED_LOCAL_FILESYSTEM`, - # if to make all rank busy_wait in the `global_barrier`. - # This will ensure that everoone exited the `local_barrier`. - # but given the poor performance of our FS, we will avoid that for now... + # is to make all ranks busy_wait in the `global_barrier`. + # This will ensure that every-one exited the `local_barrier`. + # but given the poor performance of our FS, we will avoid that for now... Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) exit end From 94a2c7390053db36df401871796ec0309cfb8cf2 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 22:05:59 +0000 Subject: [PATCH 5/8] remove stupid doc --- integration_tests/parallel_execution.bats | 3 --- 1 file changed, 3 deletions(-) diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index d6df6626..19d26768 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -10,9 +10,6 @@ teardown_file() { rm -rf $THAPI_HOME/thapi-traces } -# To avoid race condition, the HAPI_SYNC_DAEMON=f tests need to be run: -# - on one node, -# - or tracing a MPI appplication @test "sync_daemon_fs" { THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } From 56e6f763286b72698fd8243764eb525b7d2101cb Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 22:08:05 +0000 Subject: [PATCH 6/8] fix comment --- xprof/sync_daemon_fs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index 93533dac..dea845b7 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -75,8 +75,8 @@ end Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do # We cannot delete SHARED_LOCAL_FILESYSTEM - # Some rank can exit the `global_barier`, as hence call this function - # when others rake are still in the `local_barrier` + # Some rank can exit the `global_barier` (hence calling this function) + # when others ranks are still in the `local_barrier` # If we delete SHARED_LOCAL_FILESYSTEM, it will deadlock # # One possibility to be abble to remove `SHARED_LOCAL_FILESYSTEM`, From b91101a3cfb9aab90639a7ad9d73c6dca89613f0 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 22:17:47 +0000 Subject: [PATCH 7/8] increase timeout --- integration_tests/parallel_execution.bats | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index 19d26768..05668a66 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -11,27 +11,27 @@ teardown_file() { } @test "sync_daemon_fs" { - THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } @test "iprof_fs" { - THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN } @test "sync_daemon_fs_launching_mpi_app" { mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld - THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld + THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld } @test "sync_daemon_mpi" { - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } @test "iprof_mpi" { - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN } @test "sync_daemon_mpi_launching_mpi_app" { mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld } From 5d1634da2af136abf315482284098fdcc9e3fc29 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 22:19:41 +0000 Subject: [PATCH 8/8] fix typo --- xprof/sync_daemon_mpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index bb9868bf..302963c4 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ -99,7 +99,7 @@ int signal_loop(int parent_pid, MPI_Comm MPI_COMM_WORLD_THAPI, MPI_Comm MPI_COMM } else if (signum == RT_SIGNAL_GLOBAL_BARRIER) { MPI_Barrier(MPI_COMM_WORLD_THAPI); } else { - fprintf(stderr, "Wrong signal rreseved %d. Exiting", signum); + fprintf(stderr, "Wrong signal received %d. Exiting", signum); return 1; } kill(parent_pid, RT_SIGNAL_READY);