From 19b1cbbb3af93beaa532be783ef2c21a41123f56 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Fri, 30 Aug 2024 17:30:11 -0500 Subject: [PATCH] Fix fs (#283) * fix fs * fis fs? * fix daemon * better comment * remove stupid doc * fix comment * increase timeout * fix typo --------- Co-authored-by: Thomas Applencourt Co-authored-by: Thomas Applencourt --- integration_tests/light_iprof_only_sync.sh | 14 +++++++------- integration_tests/parallel_execution.bats | 13 ++++++------- xprof/sync_daemon_fs | 11 +++++++++-- xprof/sync_daemon_mpi.c | 2 +- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/integration_tests/light_iprof_only_sync.sh b/integration_tests/light_iprof_only_sync.sh index f6ba6fe9..5c7a4623 100755 --- a/integration_tests/light_iprof_only_sync.sh +++ b/integration_tests/light_iprof_only_sync.sh @@ -14,7 +14,7 @@ RT_SIGNAL_FINISH=$((SIGRTMIN + 3)) SIGNAL_RECEIVED="false" # Signal handler for capturing signals handle_signal() { - echo "--Received signal $1 from mpi_daemon" + echo "$PARENT_PID | Received signal $1 from mpi_daemon" if [ "$1" == "RT_SIGNAL_READY" ]; then SIGNAL_RECEIVED="true" fi @@ -42,19 +42,19 @@ PARENT_PID=$$ # Start sync daemon in the background ${THAPI_BIN_DIR}/sync_daemon_${THAPI_SYNC_DAEMON} $PARENT_PID & DAEMON_PID=$! 
-echo "Wait for daemon to be ready" +echo "$PARENT_PID | Wait for daemon to be ready" wait_for_signal -echo "Send Local Barrier signal" +echo "$PARENT_PID | Send Local Barrier signal" send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER # Run test program "$@" # Final synchronization after mpi_hello_world execution -echo "Send Local Barrier signal" +echo "$PARENT_PID | Send Local Barrier signal" send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER -echo "Send Global Barrier signal" +echo "$PARENT_PID | Send Global Barrier signal" send_signal_blocking $RT_SIGNAL_GLOBAL_BARRIER -echo "Send Termination signal" +echo "$PARENT_PID | Send Termination signal" send_signal_blocking $RT_SIGNAL_FINISH -echo "Wait for daemon to quit" +echo "$PARENT_PID | Wait for daemon to quit" wait $DAEMON_PID diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index 085128eb..05668a66 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -11,28 +11,27 @@ teardown_file() { } @test "sync_daemon_fs" { - THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } @test "iprof_fs" { - THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN } @test "sync_daemon_fs_launching_mpi_app" { mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld - THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld + THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld } - @test "sync_daemon_mpi" { - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 
./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } @test "iprof_mpi" { - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN } @test "sync_daemon_mpi_launching_mpi_app" { mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld } diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index affe14d3..dea845b7 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -74,8 +74,15 @@ Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do end Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do - FileUtils.rm_rf(SHARED_GLOBAL_FILESYSTEM) if mpi_master? - FileUtils.rm_rf(SHARED_LOCAL_FILESYSTEM) if mpi_local_master? + # We cannot delete SHARED_LOCAL_FILESYSTEM + # Some ranks can exit the `global_barrier` (hence calling this function) + # when other ranks are still in the `local_barrier` + # If we delete SHARED_LOCAL_FILESYSTEM, it will deadlock + # + # One possibility to be able to remove `SHARED_LOCAL_FILESYSTEM`, + # is to make all ranks busy_wait in the `global_barrier`. + # This will ensure that everyone exited the `local_barrier`. + # but given the poor performance of our FS, we will avoid that for now... 
Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) exit end diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index bb9868bf..302963c4 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ -99,7 +99,7 @@ int signal_loop(int parent_pid, MPI_Comm MPI_COMM_WORLD_THAPI, MPI_Comm MPI_COMM } else if (signum == RT_SIGNAL_GLOBAL_BARRIER) { MPI_Barrier(MPI_COMM_WORLD_THAPI); } else { - fprintf(stderr, "Wrong signal rreseved %d. Exiting", signum); + fprintf(stderr, "Wrong signal received %d. Exiting", signum); return 1; } kill(parent_pid, RT_SIGNAL_READY);