Skip to content

Commit

Permalink
Fix fs (#283)
Browse files Browse the repository at this point in the history
* fix fs

* fis fs?

* fix daemon

* better comment

* remove stupid doc

* fix comment

* increase timeout

* fix typo

---------

Co-authored-by: Thomas Applencourt <[email protected]>
Co-authored-by: Thomas Applencourt <[email protected]>
3 people authored Aug 30, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent e1d1692 commit 19b1cbb
Showing 4 changed files with 23 additions and 17 deletions.
14 changes: 7 additions & 7 deletions integration_tests/light_iprof_only_sync.sh
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@ RT_SIGNAL_FINISH=$((SIGRTMIN + 3))
SIGNAL_RECEIVED="false"
# Signal handler for capturing signals
handle_signal() {
echo "--Received signal $1 from mpi_daemon"
echo "$PARENT_PID | Received signal $1 from mpi_daemon"
if [ "$1" == "RT_SIGNAL_READY" ]; then
SIGNAL_RECEIVED="true"
fi
@@ -42,19 +42,19 @@ PARENT_PID=$$
# Start sync daemon in the background
${THAPI_BIN_DIR}/sync_daemon_${THAPI_SYNC_DAEMON} $PARENT_PID &
DAEMON_PID=$!
echo "Wait for daemon to be ready"
echo "$PARENT_PID | Wait for daemon to be ready"
wait_for_signal
echo "Send Local Barrier signal"
echo "$PARENT_PID | Send Local Barrier signal"
send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER
# Run test program
"$@"

# Final synchronization after mpi_hello_world execution
echo "Send Local Barrier signal"
echo "$PARENT_PID | Send Local Barrier signal"
send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER
echo "Send Global Barrier signal"
echo "$PARENT_PID | Send Global Barrier signal"
send_signal_blocking $RT_SIGNAL_GLOBAL_BARRIER
echo "Send Termination signal"
echo "$PARENT_PID | Send Termination signal"
send_signal_blocking $RT_SIGNAL_FINISH
echo "Wait for daemon to quit"
echo "$PARENT_PID | Wait for daemon to quit"
wait $DAEMON_PID
13 changes: 6 additions & 7 deletions integration_tests/parallel_execution.bats
Original file line number Diff line number Diff line change
@@ -11,28 +11,27 @@ teardown_file() {
}

@test "sync_daemon_fs" {
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
}

@test "iprof_fs" {
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
}

@test "sync_daemon_fs_launching_mpi_app" {
mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
}


@test "sync_daemon_mpi" {
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
}

@test "iprof_mpi" {
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
}

@test "sync_daemon_mpi_launching_mpi_app" {
mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
}
11 changes: 9 additions & 2 deletions xprof/sync_daemon_fs
Original file line number Diff line number Diff line change
@@ -74,8 +74,15 @@ Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do
end

Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do
FileUtils.rm_rf(SHARED_GLOBAL_FILESYSTEM) if mpi_master?
FileUtils.rm_rf(SHARED_LOCAL_FILESYSTEM) if mpi_local_master?
# We cannot delete SHARED_LOCAL_FILESYSTEM
# Some ranks can exit the `global_barrier` (hence calling this function)
# while other ranks are still in the `local_barrier`
# If we delete SHARED_LOCAL_FILESYSTEM, it will deadlock
#
# One possibility to be able to remove `SHARED_LOCAL_FILESYSTEM`
# is to make all ranks busy_wait in the `global_barrier`.
# This will ensure that everyone has exited the `local_barrier`.
# But given the poor performance of our FS, we will avoid that for now...
Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid)
exit
end
2 changes: 1 addition & 1 deletion xprof/sync_daemon_mpi.c
Original file line number Diff line number Diff line change
@@ -99,7 +99,7 @@ int signal_loop(int parent_pid, MPI_Comm MPI_COMM_WORLD_THAPI, MPI_Comm MPI_COMM
} else if (signum == RT_SIGNAL_GLOBAL_BARRIER) {
MPI_Barrier(MPI_COMM_WORLD_THAPI);
} else {
fprintf(stderr, "Wrong signal rreseved %d. Exiting", signum);
fprintf(stderr, "Wrong signal received %d. Exiting", signum);
return 1;
}
kill(parent_pid, RT_SIGNAL_READY);

0 comments on commit 19b1cbb

Please sign in to comment.