pmodels · hzhou · Oct 2, 2024 · Oct 2, 2024 · Nov 3, 2024
diff --git a/test/mpi/bench/.gitignore b/test/mpi/bench/.gitignore
@@ -1,3 +1,4 @@
 /*.c
 /p2p_bw
 /p2p_latency
+/bcast
diff --git a/test/mpi/bench/Makefile.am b/test/mpi/bench/Makefile.am
@@ -11,7 +11,8 @@ LDADD += -lm
 ## correctly
 noinst_PROGRAMS = \
     p2p_latency \
-    p2p_bw
+    p2p_bw \
+    bcast
 
 .def.c:
 	mydef_page $<
diff --git a/test/mpi/bench/barrier.def b/test/mpi/bench/barrier.def
@@ -0,0 +1,6 @@
+include: macros/bench_frame.def
+include: macros/bench_coll.def
+# Runs bench_barrier 10 times; each call prints its barrier latency on rank 0.
+page: barrier, bench_frame
+    $for 0:10
+        bench_barrier(comm)
diff --git a/test/mpi/bench/bcast.def b/test/mpi/bench/bcast.def
@@ -0,0 +1,32 @@
+include: macros/bench_frame.def
+include: macros/bench_coll.def
+include: macros/mtest.def
+# Broadcast latency benchmark: per message size, warm up to pick iter (at least 100), then time MPI_Bcast from root; the if:0 branch is disabled.
+page: bcast, bench_frame
+    data: buf, size, MPI_CHAR
+
+    $global root=0
+    $(if:0)
+        &call measure_with_barrier
+            MPI_Bcast($(data), root, comm)
+    $(else)
+        $if grank == 0
+            $call header_coll_latency
+        &call foreach_size
+            $my tf_min, tf_max, tf_avg, tf_sigma
+            $(set:MIN_ITER=0.001/tf_max)
+            &call coll_warmup
+                measure_bcast(iter, root, comm, buf, size, &tf_min, &tf_max, &tf_avg, &tf_sigma)
+                tf_dur = tf_max
+            $if iter < 100
+                iter = 100
+            measure_bcast(iter, root, comm, buf, size, &tf_min, &tf_max, &tf_avg, &tf_sigma)
+            $if grank == 0
+                $call report_coll_latency, size
+# measure_bcast: time iter barrier-separated MPI_Bcast calls via measure_coll_latency and store tf_min/tf_max/tf_avg/tf_sigma through the pf_* pointers.
+fncode: measure_bcast(int iter, int root, comm, buf, size, pf_min, pf_max, pf_avg, pf_sigma)
+    &call measure_coll_latency, iter
+        MPI_Bcast($(data), root, comm)
+    $(for:min,max,avg,sigma)
+        *pf_$1 = tf_$1
+
diff --git a/test/mpi/bench/macros/bench_coll.def b/test/mpi/bench/macros/bench_coll.def
@@ -0,0 +1,77 @@
+subcode: coll_warmup # rank 0 drives warm_up; the other ranks repeat BLOCK with each iter it broadcasts
+    $if grank == 0
+        &call warm_up, iter, tf_dur
+            MPI_Bcast(&iter, 1, MPI_INT, 0, comm)
+            BLOCK
+            # $dump tf_dur, iter, num_best
+        $my tn_zero = 0
+        MPI_Bcast(&tn_zero, 1, MPI_INT, 0, comm)
+    $else
+        $while 1
+            MPI_Bcast(&iter, 1, MPI_INT, 0, comm)
+            $if iter == 0
+                break
+            BLOCK
+    # rank 0's tn_zero broadcast (received as iter == 0) ended the loop above; now every rank receives the final iter
+    MPI_Bcast(&iter, 1, MPI_INT, 0, comm)
+
+subcode: measure_with_barrier # times BLOCK + MPI_Barrier cumulatively over iter runs, then subtracts the barrier-only latency
+    tf_barrier = bench_barrier(comm)
+    $if grank == 0
+        $call header_latency
+    &call foreach_size
+        &call coll_warmup
+            &call measure, iter
+                BLOCK
+                MPI_Barrier(comm)
+        &call run_stat, NUM_REPEAT, tf_latency
+            &call measure, iter
+                BLOCK
+                MPI_Barrier(comm)
+            tf_latency = (tf_dur / iter) - tf_barrier
+        $if grank == 0
+            $call report_latency, size, 1
+
+# Measure each iteration's latency individually, right after a barrier (as osu_bcast)
+subcode: measure_coll_latency(iter)
+    $my tf_max, tf_min, tf_avg, tf_sigma # output variables
+
+    $(set:RUN_STAT_VARIANCE=1)
+    &call run_stat, $(iter), tf_latency
+        MPI_Barrier(comm)
+        tf_start = MPI_Wtime()
+        BLOCK
+        tf_latency = (MPI_Wtime() - tf_start)
+    # run_stat leaves this rank's mean latency in sum1 and, with RUN_STAT_VARIANCE set, its variance in sum2
+    $(for:max,min,avg and MAX,MIN,SUM)
+        MPI_Reduce(&sum1, &tf_$1, 1, MPI_DOUBLE, MPI_$2, 0, comm)
+    $(for:sigma and SUM)
+        MPI_Reduce(&sum2, &tf_$1, 1, MPI_DOUBLE, MPI_$2, 0, comm)
+    $(if:1)
+        # results are only meaningful on rank 0 (the MPI_Reduce root), but every rank computes them for simplicity
+        tf_avg /= gsize
+        tf_sigma = sqrt(tf_sigma / gsize)
+
+# Barrier latency is measured cumulatively over iter back-to-back barriers (as p2p_latency); returns the mean per-barrier time
+fncode: bench_barrier(comm)
+    $(set:WARM_UP_NUM_BEST=20)
+    $local int iter
+    &call coll_warmup
+        &call measure, iter
+            MPI_Barrier(comm)
+
+    &call run_stat, NUM_REPEAT, tf_latency
+        &call measure, iter
+            MPI_Barrier(comm)
+        tf_latency = (tf_dur / iter)
+    # sum1/sum2: mean and spread of tf_latency as left by run_stat
+    $if grank == 0
+        printf("Barrier latency %.3f +/- %.3f us\n", sum1 * 1e6, sum2 * 1e6)
+    return sum1
+
+subcode: header_coll_latency # column titles matching the rows printed by report_coll_latency
+    printf("%12s %8s %8s %8s     %6s  (in microseconds)\n", "msgsize", "min", "max", "avg", "sigma")
+
+subcode: report_coll_latency(MSGSIZE) # one row: message size, then tf_min/tf_max/tf_avg/tf_sigma scaled by 1e6 for the microsecond columns
+    printf("%12d %8.3f %8.3f %8.3f     %6.3f\n", $(MSGSIZE), tf_min*1e6, tf_max*1e6, tf_avg*1e6, tf_sigma*1e6)
+
diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def
@@ -20,7 +20,7 @@ subcode: bench_frame
         $(else)
             MPI_Init(NULL, NULL);
 
-        $my grank, gsize: int
+        $global grank, gsize: int
         MPI_Comm_rank(MPI_COMM_WORLD, &grank);
         MPI_Comm_size(MPI_COMM_WORLD, &gsize);
         $(if:MIN_PROCS)
@@ -54,21 +54,28 @@ subcode: bench_frame
 macros:
     use_double: 1
 
+macros:
+    MAX_BUFSIZE: 5000000  # 5 MB
+
 #----------------------------------------
 subcode: _autoload
     $register_prefix(comm) MPI_Comm
+    $define MAX_BUFSIZE 5000000
+    $define NUM_REPEAT 20
 
 subcode: foreach_size
-    $for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2
+    $for int size = 0; size < $(MAX_BUFSIZE); size = (size==0)?1:size*2
         $(set:MSG_SIZE=size)
         BLOCK
 
+# measure tf_dur, the total wall time of iter repetitions of BLOCK
 subcode: measure(iter)
     tf_start = MPI_Wtime()
     $for 0:$(iter)
         BLOCK
     tf_dur = MPI_Wtime() - tf_start
 
+# repeat N times; leaves the average in sum1 and the std (the variance if RUN_STAT_VARIANCE is set) in sum2
 subcode: run_stat(N, var)
     $my double sum1=0, double sum2=0
     $for 0:$(N)
@@ -77,17 +84,28 @@ subcode: run_stat(N, var)
         sum2 += $(var) * $(var)
     sum1 /= $(N)
     sum2 /= $(N)
-    sum2 = sqrt(sum2 - sum1 * sum1)
+    $(if:RUN_STAT_VARIANCE)
+        sum2 = (sum2 - sum1 * sum1)
+    $(else)
+        sum2 = sqrt(sum2 - sum1 * sum1)
 
+# repeat until dur stabilizes and iter has been adjusted so that a run lasts at least 1 ms
 subcode: warm_up(iter, dur)
-    $(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
+    # minimum iteration to fill the duration to 1 ms
+    $(if:!WARM_UP_NUM_BEST)
+        $(set:NUM_BEST=10)
+    $(else)
+        $(set:NUM_BEST=$(WARM_UP_NUM_BEST))
+    $(if:!MIN_ITER)
+        $(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
     $(iter) = 2
     $my double last_dur = 1.0
     $my int num_best = 0
-    $while num_best < 10
+    $while num_best < $(NUM_BEST)
         BLOCK
-        $if $(iter) < $(MIN_ITER)
-            $(iter) = $(MIN_ITER)
+        $my int min_iter = $(MIN_ITER)
+        $if $(iter) < 10000 && $(iter) < min_iter
+            $(iter) = min_iter
             num_best = 0
             continue
         # check that t_dur is no longer monotonically decreasing

diff --git a/test/mpi/bench/macros/bench_p2p.def b/test/mpi/bench/macros/bench_p2p.def
@@ -16,7 +16,7 @@
 
 macros:
     MIN_PROCS: 2
-    MAX_BUFSIZE: 5000000  # 5 MB
+    MEM_TYPES: sendrecv
 
 subcode: _autoload
     $register_name(src) int
@@ -25,8 +25,6 @@ subcode: _autoload
     $register_name(size) int
     $define TAG 0
     $define SYNC_TAG 100
-    $define MAX_BUFSIZE 5000000
-    $define NUM_REPEAT 20
 
 subcode: report_header
         $call header_latency

diff --git a/test/mpi/bench/macros/mtest.def b/test/mpi/bench/macros/mtest.def
@@ -3,12 +3,19 @@ macros:
 
 subcode: mtest_malloc(size)
     MTestArgList *head = MTestArgListCreate(argc, argv)
-    int send_rank = 0, recv_rank = 1;
-    $(for:a in send,recv)
-        $if grank == $(a)_rank
-            $my mtest_mem_type_e $(a)_memtype, int $(a)_device
-            $(a)_memtype = MTestArgListGetMemType(head, "$(a)mem")
-            $(a)_device = MTestArgListGetInt_with_default(head, "$(a)dev", 0)
-            MTestMalloc($(size), $(a)_memtype, NULL, &buf, $(a)_device)
-            MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name($(a)_memtype), $(a)_device, $(size))
+    $(if:MEM_TYPES=sendrecv)
+        int send_rank = 0, recv_rank = 1;
+        $(for:a in send,recv)
+            $if grank == $(a)_rank
+                $call alloc_mem_dev, $(a)mem, $(a)dev
+    $(else)
+        # all procs allocating the same memory types
+        $call alloc_mem_dev, memtype, device
     MTestArgListDestroy(head)
+
+    subcode: alloc_mem_dev(memtype, memdev) # memtype/memdev: names of the arguments looked up in head; grank is passed as the default for the device lookup
+        $my mtest_mem_type_e memtype, int device
+        memtype = MTestArgListGetMemType(head, "$(memtype)")
+        device = MTestArgListGetInt_with_default(head, "$(memdev)", grank)
+        MTestMalloc($(size), memtype, NULL, &buf, device)
+        MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name(memtype), device, $(size))
diff --git a/test/mpi/bench/p2p_bw.def b/test/mpi/bench/p2p_bw.def
@@ -9,7 +9,7 @@ page: p2p_bw, bench_frame
     MULTIPLICITY: WINDOW_SIZE
     data: buf, size, MPI_CHAR
 
-    $for int size = 1; size < MAX_BUFSIZE; size *= 2
+    &call foreach_size
         bench_p2p(comm, 0, 1, buf, size)
 
     subcode: send_side

diff --git a/test/mpi/bench/p2p_latency.def b/test/mpi/bench/p2p_latency.def
@@ -6,8 +6,7 @@ page: p2p_latency, bench_frame
     MULTIPLICITY: 2
     data: buf, size, MPI_CHAR
 
-    bench_p2p(comm, 0, 1, buf, 0)
-    $for int size = 1; size < MAX_BUFSIZE; size *= 2
+    &call foreach_size
         bench_p2p(comm, 0, 1, buf, size)
 
     subcode: send_side

diff --git a/test/mpi/bench/testlist b/test/mpi/bench/testlist
@@ -1,2 +1,3 @@
 p2p_latency 2 resultTest=TestBench
 p2p_bw 2 resultTest=TestBench
+bcast 16 resultTest=TestBench