Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Race Condition numero2323 #284

Merged
merged 3 commits into from
Sep 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 22 additions & 18 deletions integration_tests/light_iprof_only_sync.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/bin/bash
# Strict mode: abort on a failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# PID of this script. Used as a prefix in log lines and passed to the sync
# daemon so that the daemon can send signals back to us.
PARENT_PID=$$

# Base real-time signal number; the RT_SIGNAL_* values are offsets from it.
SIGRTMIN=$(kill -l SIGRTMIN)

Expand All @@ -10,11 +13,9 @@ RT_SIGNAL_GLOBAL_BARRIER=$((SIGRTMIN + 1))
# Signal sent to the daemon to request a local barrier.
RT_SIGNAL_LOCAL_BARRIER=$((SIGRTMIN + 2))
# Signal sent to the daemon to request termination.
RT_SIGNAL_FINISH=$((SIGRTMIN + 3))

# Readiness flag: handle_signal sets it to "true" when RT_SIGNAL_READY is
# received; wait_for_signal polls it until it is no longer "false".
SIGNAL_RECEIVED="false"
#######################################
# Signal handler: logs the trapped signal and records daemon readiness.
# Globals:   PARENT_PID (read), SIGNAL_RECEIVED (written)
# Arguments: $1 - symbolic name of the signal that was trapped
# Outputs:   exactly one timestamped log line on stdout
#######################################
handle_signal() {
  # Log once per signal; the peer process is the sync daemon.
  echo "$PARENT_PID $(date) | Received signal $1 from sync_daemon"
  # Only the READY signal releases a pending wait_for_signal.
  if [[ "$1" == "RT_SIGNAL_READY" ]]; then
    SIGNAL_RECEIVED="true"
  fi
}
Expand All @@ -25,36 +26,39 @@ trap 'handle_signal RT_SIGNAL_READY' $RT_SIGNAL_READY

#######################################
# Block until RT_SIGNAL_READY has been received.
#
# The flag is deliberately NOT reset here: the caller must set
# SIGNAL_RECEIVED="false" *before* triggering the action that makes the
# daemon answer. Resetting it inside this function would erase a READY
# signal that arrived between the trigger and the wait, and the loop would
# then spin forever.
#
# Globals: SIGNAL_RECEIVED (read; set to "true" by handle_signal)
#######################################
wait_for_signal() {
  while [[ "$SIGNAL_RECEIVED" == "false" ]]; do
    sleep 0.1 # Small sleep to prevent busy looping
  done
}

# Helpers that start and signal the sync daemon. Signal numbers use the
# adjusted SIGRTMIN corresponding to the MPI signal daemon defines.
# To avoid a race condition, `SIGNAL_RECEIVED` needs to be reset to "false"
# before spawning or signaling the daemon, never afterwards.
#
# spawn_daemon_blocking: launch the daemon in the background, remember its
# PID in DAEMON_PID, and block until it reports ready.
# Globals: THAPI_BIN_DIR, THAPI_SYNC_DAEMON, PARENT_PID (read);
#          SIGNAL_RECEIVED, DAEMON_PID (written)
spawn_daemon_blocking() {
  # Clear the flag first so a fast READY reply cannot be overwritten.
  SIGNAL_RECEIVED="false"

  # The daemon receives our PID as its only argument.
  "${THAPI_BIN_DIR}/sync_daemon_${THAPI_SYNC_DAEMON}" "${PARENT_PID}" &
  DAEMON_PID="$!"

  wait_for_signal
}

#######################################
# Send one signal to the sync daemon and block until it answers with
# RT_SIGNAL_READY.
# Globals:   DAEMON_PID (read), SIGNAL_RECEIVED (written)
# Arguments: $1 - signal number to deliver to the daemon
#######################################
send_signal_blocking() {
  # Reset the flag BEFORE signaling: the daemon may reply before kill
  # returns, and a later reset would discard that reply.
  SIGNAL_RECEIVED="false"
  # Deliver the signal exactly once.
  kill -"$1" "$DAEMON_PID"
  wait_for_signal
}

# Start the sync daemon in the background (exactly once) and block until it
# reports ready. PARENT_PID is already set at the top of the script.
echo "$PARENT_PID $(date) | Spawn Daemon"
spawn_daemon_blocking
echo "$PARENT_PID $(date) | Send Local Barrier signal"
send_signal_blocking "$RT_SIGNAL_LOCAL_BARRIER"

# Run the test program given on the command line.
"$@"

# Final synchronization after the test program has finished.
echo "$PARENT_PID $(date) | Send Local Barrier signal"
send_signal_blocking "$RT_SIGNAL_LOCAL_BARRIER"
echo "$PARENT_PID $(date) | Send Global Barrier signal"
send_signal_blocking "$RT_SIGNAL_GLOBAL_BARRIER"
echo "$PARENT_PID $(date) | Send Termination signal"
send_signal_blocking "$RT_SIGNAL_FINISH"

# Reap the daemon so its exit status becomes the script's exit status.
echo "$PARENT_PID $(date) | Wait for daemon to quit"
wait "$DAEMON_PID"