Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test/bench: add bcast benchmark #7157

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/mpi/bench/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/*.c
/p2p_bw
/p2p_latency
/bcast
3 changes: 2 additions & 1 deletion test/mpi/bench/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ LDADD += -lm
## correctly
noinst_PROGRAMS = \
p2p_latency \
p2p_bw
p2p_bw \
bcast

.def.c:
mydef_page $<
6 changes: 6 additions & 0 deletions test/mpi/bench/barrier.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include: macros/bench_frame.def
include: macros/bench_coll.def

# Standalone barrier benchmark page built on the bench_frame template.
# The body repeats bench_barrier(comm) over the range 0:10.
page: barrier, bench_frame
$for 0:10
bench_barrier(comm)
32 changes: 32 additions & 0 deletions test/mpi/bench/bcast.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
include: macros/bench_frame.def
include: macros/bench_coll.def
include: macros/mtest.def

# MPI_Bcast latency benchmark page built on the bench_frame template.
# $(data) expands to "buf, size, MPI_CHAR", so MPI_Bcast($(data), root, comm)
# broadcasts `size` chars from `buf`.
# NOTE(review): indentation is not visible in this view; the nesting described
# in the comments below is inferred from the statement order -- confirm.
page: bcast, bench_frame
data: buf, size, MPI_CHAR

$global root=0
# $(if:0) is a constant-false preprocessor test: the barrier-subtracting
# variant below is kept for reference, the $(else) branch is what is generated.
$(if:0)
&call measure_with_barrier
MPI_Bcast($(data), root, comm)
$(else)
$if grank == 0
$call header_coll_latency
&call foreach_size
$my tf_min, tf_max, tf_avg, tf_sigma
# pre-set MIN_ITER so warm_up skips its own default formula (it only sets
# MIN_ITER when the macro is not already defined)
$(set:MIN_ITER=0.001/tf_max)
&call coll_warmup
measure_bcast(iter, root, comm, buf, size, &tf_min, &tf_max, &tf_avg, &tf_sigma)
# coll_warmup passes tf_dur to warm_up as its duration argument, so feed it
# the slowest reported latency
tf_dur = tf_max
# use at least 100 iterations for the reported measurement
$if iter < 100
iter = 100
measure_bcast(iter, root, comm, buf, size, &tf_min, &tf_max, &tf_avg, &tf_sigma)
$if grank == 0
$call report_coll_latency, size

# Runs MPI_Bcast as the measured body of measure_coll_latency with `iter`
# samples, then copies the resulting tf_min/tf_max/tf_avg/tf_sigma into the
# caller's pf_min/pf_max/pf_avg/pf_sigma output pointers.
fncode: measure_bcast(int iter, int root, comm, buf, size, pf_min, pf_max, pf_avg, pf_sigma)
&call measure_coll_latency, iter
MPI_Bcast($(data), root, comm)
# expands to one assignment per statistic: *pf_min = tf_min, *pf_max = tf_max, ...
$(for:min,max,avg,sigma)
*pf_$1 = tf_$1

77 changes: 77 additions & 0 deletions test/mpi/bench/macros/bench_coll.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Warm-up driver for collective benchmarks; BLOCK is the caller-supplied body.
# Rank 0 runs warm_up and broadcasts the current `iter` before each BLOCK so the
# other ranks can run the same BLOCK with the same iteration count.
# NOTE(review): indentation is not visible in this view; the nesting described
# here is inferred from the $if/$else/$while structure -- confirm.
subcode: coll_warmup
$if grank == 0
&call warm_up, iter, tf_dur
MPI_Bcast(&iter, 1, MPI_INT, 0, comm)
BLOCK
# $dump tf_dur, iter, num_best
# a broadcast value of 0 tells the other ranks to leave their receive loop
$my tn_zero = 0
MPI_Bcast(&tn_zero, 1, MPI_INT, 0, comm)
$else
# other ranks: keep receiving iter and running BLOCK until 0 arrives
$while 1
MPI_Bcast(&iter, 1, MPI_INT, 0, comm)
$if iter == 0
break
BLOCK

# broadcast rank 0's final iter; the other ranks last received the 0 sentinel
MPI_Bcast(&iter, 1, MPI_INT, 0, comm)

# Cumulative latency measurement: each iteration runs BLOCK followed by
# MPI_Barrier, and the separately measured barrier latency is subtracted.
# NOTE(review): indentation is not visible in this view; nesting is inferred.
subcode: measure_with_barrier
# barrier-only latency, subtracted from every sample below
tf_barrier = bench_barrier(comm)
$if grank == 0
$call header_latency
&call foreach_size
# warm up with the same BLOCK + barrier body that is measured afterwards
&call coll_warmup
&call measure, iter
BLOCK
MPI_Barrier(comm)
&call run_stat, NUM_REPEAT, tf_latency
&call measure, iter
BLOCK
MPI_Barrier(comm)
# tf_dur covers `iter` iterations; convert to per-iteration time and remove the barrier cost
tf_latency = (tf_dur / iter) - tf_barrier
$if grank == 0
$call report_latency, size, 1

# Measure individual latency after a barrier (as osu_bcast)
# Each sample times one execution of BLOCK, started right after an MPI_Barrier.
# run_stat's per-rank sum1/sum2 are then reduced to rank 0 into
# tf_min, tf_max, tf_avg and tf_sigma.
subcode: measure_coll_latency(iter)
$my tf_max, tf_min, tf_avg, tf_sigma # output variables

# make run_stat leave the variance (not its square root) in sum2
$(set:RUN_STAT_VARIANCE=1)
&call run_stat, $(iter), tf_latency
MPI_Barrier(comm)
tf_start = MPI_Wtime()
BLOCK
tf_latency = (MPI_Wtime() - tf_start)

# sum1 is reduced with MAX, MIN and SUM into tf_max, tf_min and tf_avg
$(for:max,min,avg and MAX,MIN,SUM)
MPI_Reduce(&sum1, &tf_$1, 1, MPI_DOUBLE, MPI_$2, 0, comm)
# sum2 is summed across ranks into tf_sigma
$(for:sigma and SUM)
MPI_Reduce(&sum2, &tf_$1, 1, MPI_DOUBLE, MPI_$2, 0, comm)
$(if:1)
# only for rank 0, but do it collective for simplicity
# turn the SUM reductions into a cross-rank mean and the square root of the mean of sum2
tf_avg /= gsize
tf_sigma = sqrt(tf_sigma / gsize)

# Barrier latency is measured cumulatively (as p2p_latency)
# Warms up, then takes NUM_REPEAT samples of `iter` back-to-back barriers.
# Rank 0 prints the result; the function returns sum1 as left by run_stat.
fncode: bench_barrier(comm)
# raise warm_up's NUM_BEST threshold from its default of 10 to 20
$(set:WARM_UP_NUM_BEST=20)
$local int iter
&call coll_warmup
&call measure, iter
MPI_Barrier(comm)

&call run_stat, NUM_REPEAT, tf_latency
&call measure, iter
MPI_Barrier(comm)
# tf_dur covers `iter` barriers; convert to time per barrier
tf_latency = (tf_dur / iter)

$if grank == 0
# sum1/sum2 are scaled by 1e6 to print in the "us" unit shown in the message
printf("Barrier latency %.3f +/- %.3f us\n", sum1 * 1e6, sum2 * 1e6)
return sum1

# Prints the column header for the collective latency table; the column widths
# match the row format used by report_coll_latency.
subcode: header_coll_latency
printf("%12s %8s %8s %8s %6s (in microseconds)\n", "msgsize", "min", "max", "avg", "sigma")

# Prints one table row: $(MSGSIZE) followed by tf_min, tf_max, tf_avg and
# tf_sigma, each multiplied by 1e6 to match the "(in microseconds)" header.
subcode: report_coll_latency(MSGSIZE)
printf("%12d %8.3f %8.3f %8.3f %6.3f\n", $(MSGSIZE), tf_min*1e6, tf_max*1e6, tf_avg*1e6, tf_sigma*1e6)

32 changes: 25 additions & 7 deletions test/mpi/bench/macros/bench_frame.def
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ subcode: bench_frame
$(else)
MPI_Init(NULL, NULL);

$my grank, gsize: int
$global grank, gsize: int
MPI_Comm_rank(MPI_COMM_WORLD, &grank);
MPI_Comm_size(MPI_COMM_WORLD, &gsize);
$(if:MIN_PROCS)
Expand Down Expand Up @@ -54,21 +54,28 @@ subcode: bench_frame
macros:
use_double: 1

macros:
MAX_BUFSIZE: 5000000 # 5 MB

#----------------------------------------
subcode: _autoload
$register_prefix(comm) MPI_Comm
$define MAX_BUFSIZE 5000000
$define NUM_REPEAT 20

subcode: foreach_size
$for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2
$for int size = 0; size < $(MAX_BUFSIZE); size = (size==0)?1:size*2
$(set:MSG_SIZE=size)
BLOCK

# measure tf_dur over iter
# Runs BLOCK $(iter) times between two MPI_Wtime() calls; tf_dur is the total
# elapsed time for all iterations, not a per-iteration value.
subcode: measure(iter)
tf_start = MPI_Wtime()
$for 0:$(iter)
BLOCK
tf_dur = MPI_Wtime() - tf_start

# repeat N times; leaves the average in sum1 and the std deviation in sum2 (the variance instead when RUN_STAT_VARIANCE is set)
subcode: run_stat(N, var)
$my double sum1=0, double sum2=0
$for 0:$(N)
Expand All @@ -77,17 +84,28 @@ subcode: run_stat(N, var)
sum2 += $(var) * $(var)
sum1 /= $(N)
sum2 /= $(N)
sum2 = sqrt(sum2 - sum1 * sum1)
$(if:RUN_STAT_VARIANCE)
sum2 = (sum2 - sum1 * sum1)
$(else)
sum2 = sqrt(sum2 - sum1 * sum1)

# repeat until dur stabilizes and iter has been raised to at least the minimum needed to fill 1 ms
subcode: warm_up(iter, dur)
$(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
# minimum iteration to fill the duration to 1 ms
$(if:!WARM_UP_NUM_BEST)
$(set:NUM_BEST=10)
$(else)
$(set:NUM_BEST=$(WARM_UP_NUM_BEST))
$(if:!MIN_ITER)
$(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
$(iter) = 2
$my double last_dur = 1.0
$my int num_best = 0
$while num_best < 10
$while num_best < $(NUM_BEST)
BLOCK
$if $(iter) < $(MIN_ITER)
$(iter) = $(MIN_ITER)
$my int min_iter = $(MIN_ITER)
$if $(iter) < 10000 && $(iter) < min_iter
$(iter) = min_iter
num_best = 0
continue
# check that t_dur is no longer monotonically decreasing
Expand Down
4 changes: 1 addition & 3 deletions test/mpi/bench/macros/bench_p2p.def
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

macros:
MIN_PROCS: 2
MAX_BUFSIZE: 5000000 # 5 MB
MEM_TYPES: sendrecv

subcode: _autoload
$register_name(src) int
Expand All @@ -25,8 +25,6 @@ subcode: _autoload
$register_name(size) int
$define TAG 0
$define SYNC_TAG 100
$define MAX_BUFSIZE 5000000
$define NUM_REPEAT 20

subcode: report_header
$call header_latency
Expand Down
23 changes: 15 additions & 8 deletions test/mpi/bench/macros/mtest.def
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@ macros:

subcode: mtest_malloc(size)
MTestArgList *head = MTestArgListCreate(argc, argv)
int send_rank = 0, recv_rank = 1;
$(for:a in send,recv)
$if grank == $(a)_rank
$my mtest_mem_type_e $(a)_memtype, int $(a)_device
$(a)_memtype = MTestArgListGetMemType(head, "$(a)mem")
$(a)_device = MTestArgListGetInt_with_default(head, "$(a)dev", 0)
MTestMalloc($(size), $(a)_memtype, NULL, &buf, $(a)_device)
MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name($(a)_memtype), $(a)_device, $(size))
$(if:MEM_TYPES=sendrecv)
int send_rank = 0, recv_rank = 1;
$(for:a in send,recv)
$if grank == $(a)_rank
$call alloc_mem_dev, $(a)mem, $(a)dev
$(else)
# all procs allocating the same memory types
$call alloc_mem_dev, memtype, device
MTestArgListDestroy(head)

# Allocates `buf` through MTestMalloc using a memory type and device id read
# from the MTest argument list `head`.
# $(memtype) and $(memdev) are the argument names to look up; the bare
# identifiers memtype and device are the local variables holding the values.
# NOTE(review): $(size) is not a parameter of this subcode; presumably it comes
# from the calling mtest_malloc(size) -- confirm.
subcode: alloc_mem_dev(memtype, memdev) # memtype and memdev are parameter names
$my mtest_mem_type_e memtype, int device
memtype = MTestArgListGetMemType(head, "$(memtype)")
# the device id defaults to this process's rank when the argument is absent
device = MTestArgListGetInt_with_default(head, "$(memdev)", grank)
MTestMalloc($(size), memtype, NULL, &buf, device)
MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name(memtype), device, $(size))
2 changes: 1 addition & 1 deletion test/mpi/bench/p2p_bw.def
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ page: p2p_bw, bench_frame
MULTIPLICITY: WINDOW_SIZE
data: buf, size, MPI_CHAR

$for int size = 1; size < MAX_BUFSIZE; size *= 2
&call foreach_size
bench_p2p(comm, 0, 1, buf, size)

subcode: send_side
Expand Down
3 changes: 1 addition & 2 deletions test/mpi/bench/p2p_latency.def
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ page: p2p_latency, bench_frame
MULTIPLICITY: 2
data: buf, size, MPI_CHAR

bench_p2p(comm, 0, 1, buf, 0)
$for int size = 1; size < MAX_BUFSIZE; size *= 2
&call foreach_size
bench_p2p(comm, 0, 1, buf, size)

subcode: send_side
Expand Down
1 change: 1 addition & 0 deletions test/mpi/bench/testlist
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
p2p_latency 2 resultTest=TestBench
p2p_bw 2 resultTest=TestBench
bcast 16 resultTest=TestBench