Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix fs #283

Merged
merged 8 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions integration_tests/light_iprof_only_sync.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ RT_SIGNAL_FINISH=$((SIGRTMIN + 3))
SIGNAL_RECEIVED="false"
# Signal handler for capturing signals
handle_signal() {
echo "--Received signal $1 from mpi_daemon"
echo "$PARENT_PID | Received signal $1 from mpi_daemon"
if [ "$1" == "RT_SIGNAL_READY" ]; then
SIGNAL_RECEIVED="true"
fi
Expand Down Expand Up @@ -42,19 +42,19 @@ PARENT_PID=$$
# Start sync daemon in the background
${THAPI_BIN_DIR}/sync_daemon_${THAPI_SYNC_DAEMON} $PARENT_PID &
DAEMON_PID=$!
echo "Wait for daemon to be ready"
echo "$PARENT_PID | Wait for daemon to be ready"
wait_for_signal
echo "Send Local Barrier signal"
echo "$PARENT_PID | Send Local Barrier signal"
send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER
# Run test program
"$@"

# Final synchronization after mpi_hello_world execution
echo "Send Local Barrier signal"
echo "$PARENT_PID | Send Local Barrier signal"
send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER
echo "Send Global Barrier signal"
echo "$PARENT_PID | Send Global Barrier signal"
send_signal_blocking $RT_SIGNAL_GLOBAL_BARRIER
echo "Send Termination signal"
echo "$PARENT_PID | Send Termination signal"
send_signal_blocking $RT_SIGNAL_FINISH
echo "Wait for daemon to quit"
echo "$PARENT_PID | Wait for daemon to quit"
wait $DAEMON_PID
13 changes: 6 additions & 7 deletions integration_tests/parallel_execution.bats
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,27 @@ teardown_file() {
}

@test "sync_daemon_fs" {
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
}

@test "iprof_fs" {
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
}

@test "sync_daemon_fs_launching_mpi_app" {
mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
}


@test "sync_daemon_mpi" {
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN
}

@test "iprof_mpi" {
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN
}

@test "sync_daemon_mpi_launching_mpi_app" {
mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld
}
11 changes: 9 additions & 2 deletions xprof/sync_daemon_fs
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,15 @@ Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do
end

Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do
FileUtils.rm_rf(SHARED_GLOBAL_FILESYSTEM) if mpi_master?
FileUtils.rm_rf(SHARED_LOCAL_FILESYSTEM) if mpi_local_master?
# We cannot delete SHARED_LOCAL_FILESYSTEM
# Some ranks can exit the `global_barrier` (hence calling this function)
# while other ranks are still in the `local_barrier`.
# If we delete SHARED_LOCAL_FILESYSTEM, it will deadlock
#
# One possibility to be able to remove `SHARED_LOCAL_FILESYSTEM`
# is to make all ranks busy-wait in the `global_barrier`.
# This would ensure that everyone has exited the `local_barrier`,
# but given the poor performance of our FS, we will avoid that for now...
Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid)
exit
end
Expand Down
2 changes: 1 addition & 1 deletion xprof/sync_daemon_mpi.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ int signal_loop(int parent_pid, MPI_Comm MPI_COMM_WORLD_THAPI, MPI_Comm MPI_COMM
} else if (signum == RT_SIGNAL_GLOBAL_BARRIER) {
MPI_Barrier(MPI_COMM_WORLD_THAPI);
} else {
fprintf(stderr, "Wrong signal rreseved %d. Exiting", signum);
fprintf(stderr, "Wrong signal received %d. Exiting", signum);
return 1;
}
kill(parent_pid, RT_SIGNAL_READY);
Expand Down