From f42663e3451c5d3c02a809cde5c182bcb8d88958 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 20:29:19 +0000 Subject: [PATCH 01/13] add mpi bin tests --- integration_tests/parallel_execution.bats | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index 4a164929..a4949c49 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -11,7 +11,7 @@ teardown_file() { } @test "sync_daemon_fs" { - THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=fs THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } @test "iprof_fs" { @@ -19,9 +19,19 @@ teardown_file() { } @test "sync_daemon_mpi" { - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh $THAPI_TEST_BIN } @test "iprof_mpi" { THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN } + +@test "iprof_mpi_mpi_app" { + mpicc ~/THAPI/integration_tests/ + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN +} + +@test "sync_daemon_mpi_launching_mpi_app" { + mpicc ~/THAPI/integration_tests/mpi_helloworld.c -o mpi_helloworld + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh -- ./mpi_helloworld +} From 5f931ed5acff278c404853045e1a235e4704890c Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 20:35:32 +0000 Subject: [PATCH 02/13] forgot the file... 
--- integration_tests/mpi_helloworld.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 integration_tests/mpi_helloworld.c diff --git a/integration_tests/mpi_helloworld.c b/integration_tests/mpi_helloworld.c new file mode 100644 index 00000000..fbeac576 --- /dev/null +++ b/integration_tests/mpi_helloworld.c @@ -0,0 +1,25 @@ +#include <mpi.h> +#include <stdio.h> + +int main(int argc, char** argv) { + // Initialize the MPI environment + MPI_Init(&argc, &argv); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + + printf("Hello world from processor %s, , rank %d out of %d rank.\n", processor_name, world_rank, world_size); + // Finalize the MPI environment. + MPI_Finalize(); + return 0; +} From 9963aa34cf4aa0062f13a16d1e0285d469a9112d Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 20:45:07 +0000 Subject: [PATCH 03/13] fix mistake --- integration_tests/parallel_execution.bats | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index a4949c49..cb2aa688 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -26,12 +26,7 @@ teardown_file() { THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN } -@test "iprof_mpi_mpi_app" { - mpicc ~/THAPI/integration_tests/ - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 $MPIRUN -n 2 $IPROF --debug 0 -- $THAPI_TEST_BIN -} - @test "sync_daemon_mpi_launching_mpi_app" { - mpicc ~/THAPI/integration_tests/mpi_helloworld.c -o mpi_helloworld + mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld THAPI_SYNC_DAEMON=mpi 
THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh -- ./mpi_helloworld } From b23d9619a52f0a642967c95b248ce151a82b621e Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 20:52:05 +0000 Subject: [PATCH 04/13] one more stupid mistake --- integration_tests/parallel_execution.bats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index cb2aa688..eaecfc52 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -28,5 +28,5 @@ teardown_file() { @test "sync_daemon_mpi_launching_mpi_app" { mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh -- ./mpi_helloworld + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld } From a1357b31f12ac780039ecd5a2c6fccde4704a014 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 21:11:57 +0000 Subject: [PATCH 05/13] more log --- integration_tests/light_iprof_only_sync.sh | 12 +++++++----- integration_tests/mpi_helloworld.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/integration_tests/light_iprof_only_sync.sh b/integration_tests/light_iprof_only_sync.sh index d709a143..ee2c5ba5 100755 --- a/integration_tests/light_iprof_only_sync.sh +++ b/integration_tests/light_iprof_only_sync.sh @@ -14,7 +14,7 @@ RT_SIGNAL_FINISH=$((SIGRTMIN + 3)) SIGNAL_RECEIVED="false" # Signal handler for capturing signals handle_signal() { - echo "Received signal $1 from mpi_daemon" + echo "--Received signal $1 from mpi_daemon" if [ "$1" == "RT_SIGNAL_READY" ]; then SIGNAL_RECEIVED="true" fi @@ -42,9 +42,9 @@ PARENT_PID=$$ # Start sync daemon in the background ${THAPI_BIN_DIR}/sync_daemon_${THAPI_SYNC_DAEMON} $PARENT_PID & DAEMON_PID=$! 
-# Wait for daemon to be ready +echo "Wait for daemon to be ready" wait_for_signal -# Send signals to mpi_daemon to test synchronization +echo "Send Local and Global Barrier signal" send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER send_signal_blocking $RT_SIGNAL_GLOBAL_BARRIER @@ -52,8 +52,10 @@ send_signal_blocking $RT_SIGNAL_GLOBAL_BARRIER "$@" # Final synchronization after mpi_hello_world execution +echo "Send Local and Global Barrier signal" send_signal_blocking $RT_SIGNAL_LOCAL_BARRIER send_signal_blocking $RT_SIGNAL_GLOBAL_BARRIER -# Signal to terminate the mpi_daemon +echo "Send Termination signal" send_signal_blocking $RT_SIGNAL_FINISH -wait $DAEMON_PID # Ensure daemon exits cleanly +echo "Wait for daemon to quit" +wait $DAEMON_PID diff --git a/integration_tests/mpi_helloworld.c b/integration_tests/mpi_helloworld.c index fbeac576..c4fbd400 100644 --- a/integration_tests/mpi_helloworld.c +++ b/integration_tests/mpi_helloworld.c @@ -18,7 +18,7 @@ int main(int argc, char** argv) { int name_len; MPI_Get_processor_name(processor_name, &name_len); - printf("Hello world from processor %s, , rank %d out of %d rank.\n", processor_name, world_rank, world_size); + printf("Hello world from processor %s, rank %d out of %d rank.\n", processor_name, world_rank, world_size); // Finalize the MPI environment. 
MPI_Finalize(); return 0; From ec5fbd63f73b0f7cfe752847d8f68e188aaeaf5b Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 21:20:54 +0000 Subject: [PATCH 06/13] log to debug --- integration_tests/parallel_execution.bats | 2 +- xprof/sync_daemon_mpi.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/integration_tests/parallel_execution.bats b/integration_tests/parallel_execution.bats index eaecfc52..0d480fea 100644 --- a/integration_tests/parallel_execution.bats +++ b/integration_tests/parallel_execution.bats @@ -28,5 +28,5 @@ teardown_file() { @test "sync_daemon_mpi_launching_mpi_app" { mpicc ./integration_tests/mpi_helloworld.c -o mpi_helloworld - THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 20s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld + THAPI_SYNC_DAEMON=mpi THAPI_JOBID=0 timeout 40s $MPIRUN -n 2 ./integration_tests/light_iprof_only_sync.sh ./mpi_helloworld } diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index 451e59bd..2060c818 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ -143,12 +143,16 @@ int main(int argc, char **argv) { ret = signal_loop(parent_pid, MPI_COMM_WORLD_THAPI, MPI_COMM_NODE); fn_exit: + printf("MPI_Comm_free: MPI_COMM_NODE\n"); if (MPI_COMM_NODE != MPI_COMM_NULL) MPI_Comm_free(&MPI_COMM_NODE); + printf("MPI_Comm_free: MPI_COMM_WORLD_THAPI\n"); if (MPI_COMM_WORLD_THAPI != MPI_COMM_NULL) MPI_Comm_free(&MPI_COMM_WORLD_THAPI); + printf("MPI_Session_finalize\n"); if (lib_shandle != MPI_SESSION_NULL) MPI_Session_finalize(&lib_shandle); + printf("Signal RT_SIGNAL_READY \n"); if (parent_pid != 0) kill(parent_pid, RT_SIGNAL_READY); return ret; From aba601e423796f87f49a1f4fa8f6ac2b846a47df Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 21:36:03 +0000 Subject: [PATCH 07/13] try with openmpi --- .github/workflows/presubmit.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/presubmit.yml 
b/.github/workflows/presubmit.yml index 54f49cf0..4d6df7bc 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -105,6 +105,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: mpi4py/setup-mpi@v1 + with: + mpi: openmpi - uses: actions/cache@v4 id: babeltrace2 env: @@ -145,6 +147,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: mpi4py/setup-mpi@v1 + with: + mpi: openmpi - uses: actions/cache@v4 id: babeltrace2 env: From 677f7cf433024dc665008103d68ab415e6fc00f9 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 21:45:11 +0000 Subject: [PATCH 08/13] try intel mpi --- .github/workflows/presubmit.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 4d6df7bc..3e58d978 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -106,7 +106,7 @@ jobs: - uses: actions/checkout@v4 - uses: mpi4py/setup-mpi@v1 with: - mpi: openmpi + mpi: intelmpi - uses: actions/cache@v4 id: babeltrace2 env: @@ -148,7 +148,7 @@ jobs: - uses: actions/checkout@v4 - uses: mpi4py/setup-mpi@v1 with: - mpi: openmpi + mpi: intelmpi - uses: actions/cache@v4 id: babeltrace2 env: From 62231c9dced90ead9c572a8d4ca3817be5aeab06 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 21:53:44 +0000 Subject: [PATCH 09/13] try again with intel... 
--- xprof/sync_daemon_mpi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index 2060c818..2b01b6f1 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ -37,9 +37,10 @@ int MPIX_Init_Session(MPI_Session *lib_shandle, MPI_Comm *lib_comm) { /* * check we got thread support level foo library needs */ +/* CHECK_MPI(MPI_Session_get_info(*lib_shandle, &tinfo)); { - char out_value[100]; /* large enough */ + char out_value[100]; // int valuelen = sizeof(out_value); int flag; CHECK_MPI(MPI_Info_get_string(tinfo, mt_key, &valuelen, out_value, &flag)); @@ -49,6 +50,7 @@ int MPIX_Init_Session(MPI_Session *lib_shandle, MPI_Comm *lib_comm) { fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Did not get MPI_THREAD_SINGLE, got %s\n", out_value); } +*/ /* * create a group from the WORLD process set */ From 98e04fd649731484ffd917f125322f471fe04ab7 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 24 Jul 2024 22:04:40 +0000 Subject: [PATCH 10/13] Working!! 
--- xprof/sync_daemon_mpi.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index 2b01b6f1..5da5b90d 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ -37,10 +37,11 @@ int MPIX_Init_Session(MPI_Session *lib_shandle, MPI_Comm *lib_comm) { /* * check we got thread support level foo library needs */ -/* + + /* Intel MPI doesn't support MPI_Info_get_string CHECK_MPI(MPI_Session_get_info(*lib_shandle, &tinfo)); { - char out_value[100]; // + char out_value[100]; int valuelen = sizeof(out_value); int flag; CHECK_MPI(MPI_Info_get_string(tinfo, mt_key, &valuelen, out_value, &flag)); @@ -50,7 +51,8 @@ int MPIX_Init_Session(MPI_Session *lib_shandle, MPI_Comm *lib_comm) { fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Did not get MPI_THREAD_SINGLE, got %s\n", out_value); } -*/ + */ + /* * create a group from the WORLD process set */ @@ -145,16 +147,12 @@ int main(int argc, char **argv) { ret = signal_loop(parent_pid, MPI_COMM_WORLD_THAPI, MPI_COMM_NODE); fn_exit: - printf("MPI_Comm_free: MPI_COMM_NODE\n"); if (MPI_COMM_NODE != MPI_COMM_NULL) MPI_Comm_free(&MPI_COMM_NODE); - printf("MPI_Comm_free: MPI_COMM_WORLD_THAPI\n"); if (MPI_COMM_WORLD_THAPI != MPI_COMM_NULL) MPI_Comm_free(&MPI_COMM_WORLD_THAPI); - printf("MPI_Session_finalize\n"); if (lib_shandle != MPI_SESSION_NULL) MPI_Session_finalize(&lib_shandle); - printf("Signal RT_SIGNAL_READY \n"); if (parent_pid != 0) kill(parent_pid, RT_SIGNAL_READY); return ret; From 70a7af78094c4bf2836754d69bdde29906b1f58b Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Thu, 25 Jul 2024 11:34:23 -0500 Subject: [PATCH 11/13] Apply suggestions from code review Co-authored-by: Brice Videau --- xprof/sync_daemon_mpi.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index 5da5b90d..68bc76d4 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ 
-37,22 +37,18 @@ int MPIX_Init_Session(MPI_Session *lib_shandle, MPI_Comm *lib_comm) { /* * check we got thread support level foo library needs */ - - /* Intel MPI doesn't support MPI_Info_get_string CHECK_MPI(MPI_Session_get_info(*lib_shandle, &tinfo)); { - char out_value[100]; + char out_value[100] = {0}; int valuelen = sizeof(out_value); int flag; - CHECK_MPI(MPI_Info_get_string(tinfo, mt_key, &valuelen, out_value, &flag)); + CHECK_MPI(MPI_Info_get(tinfo, mt_key, &valuelen, out_value, &flag)); if (flag == 0) fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Could not find key %s\n", mt_key); if (strcmp(out_value, mt_value)) fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Did not get MPI_THREAD_SINGLE, got %s\n", out_value); } - */ - /* * create a group from the WORLD process set */ From e00c521f05ffb77a40b348fbe64c35988f810392 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Thu, 25 Jul 2024 16:51:45 +0000 Subject: [PATCH 12/13] fix error --- xprof/sync_daemon_mpi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index 68bc76d4..fab8aeb1 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ -45,9 +45,9 @@ int MPIX_Init_Session(MPI_Session *lib_shandle, MPI_Comm *lib_comm) { CHECK_MPI(MPI_Info_get(tinfo, mt_key, &valuelen, out_value, &flag)); if (flag == 0) fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Could not find key %s\n", mt_key); - if (strcmp(out_value, mt_value)) - fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Did not get MPI_THREAD_SINGLE, got %s\n", - out_value); + else if (strcmp(out_value, mt_value)) + fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Did not get %s, got %s\n", + mt_value, out_value); } /* * create a group from the WORLD process set From 5b475ed6d946e8e82d73731872f1e8e799d6cdbc Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Thu, 25 Jul 2024 17:04:05 +0000 Subject: [PATCH 13/13] fix value --- xprof/sync_daemon_mpi.c | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/xprof/sync_daemon_mpi.c b/xprof/sync_daemon_mpi.c index fab8aeb1..601ad2f4 100644 --- a/xprof/sync_daemon_mpi.c +++ b/xprof/sync_daemon_mpi.c @@ -42,7 +42,7 @@ int MPIX_Init_Session(MPI_Session *lib_shandle, MPI_Comm *lib_comm) { char out_value[100] = {0}; int valuelen = sizeof(out_value); int flag; - CHECK_MPI(MPI_Info_get(tinfo, mt_key, &valuelen, out_value, &flag)); + CHECK_MPI(MPI_Info_get(tinfo, mt_key, valuelen, out_value, &flag)); if (flag == 0) fprintf(stderr, "THAPI_SYNC_DAEMON_MPI Warning: Could not find key %s\n", mt_key); else if (strcmp(out_value, mt_value))