Skip to content

Commit

Permalink
TL/CUDA: small refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
ikryukov committed Feb 4, 2025
1 parent 5f44941 commit 748c86c
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 13 deletions.
3 changes: 1 addition & 2 deletions src/components/tl/cuda/bcast/bcast_linear.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task)
set_rank_step(task, trank, 0, 0); // Initialize rank step tracking
ucc_memory_cpu_store_fence();
// initiate barrier wait while all ranks set their steps to 0
return ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar);
return ucc_tl_cuda_shm_barrier_start(trank, task->bar);
}

// Tests if setup is complete for a linear broadcast task
Expand Down Expand Up @@ -258,7 +258,6 @@ static void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task)
return;
}
}
task->bcast_linear.stage = STAGE_COPY;
if (task->bcast_linear.step < task->bcast_linear.num_steps) {
// go to next iteration
task->bcast_linear.stage = STAGE_COPY;
Expand Down
25 changes: 14 additions & 11 deletions src/components/tl/cuda/tl_cuda_coll.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ static inline void ucc_tl_cuda_task_put(ucc_tl_cuda_task_t *task)
ucc_mpool_put(task);
}

/**
 * Pack a (tag, root, peer) triple into a single 64-bit key.
 *
 * Bit layout of the result:
 *   bits 48..63 - tag
 *   bits 24..47 - root
 *   bits  0..23 - peer
 *
 * @param root  Root rank; must be below 2^24 (checked with assert).
 * @param peer  Peer rank; must be below 2^24 (checked with assert).
 * @param tag   16-bit tag.
 *
 * @return The packed 64-bit key.
 */
static inline uint64_t compute_key(ucc_rank_t root, ucc_rank_t peer, uint16_t tag)
{
    assert(peer < (1 << 24));
    assert(root < (1 << 24));
    /* Widen every field to 64 bits BEFORE shifting: a shift is evaluated in
     * the (promoted) type of its left operand, so with a rank type narrower
     * than 64 bits "root << 24" would drop the high bits of any root >= 256
     * and distinct roots could collide on the same key. */
    return ((uint64_t)tag << 48) | ((uint64_t)root << 24) | (uint64_t)peer;
}

static inline
ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_team_t *team,
Expand All @@ -95,6 +102,7 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
ucc_tl_cuda_lib_t *lib = UCC_TL_CUDA_TEAM_LIB(team);
uint32_t max_concurrent = lib->cfg.max_concurrent;
ucc_rank_t peer;
ucc_tl_cuda_task_t *task;
ucc_status_t status;

Expand All @@ -118,16 +126,10 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
ucc_assert(coll_args->args.coll_type == UCC_COLL_TYPE_BCAST);
task->subset.map = ucc_active_set_to_ep_map(&coll_args->args);
task->subset.myrank = UCC_TL_TEAM_RANK(team);
// root
if (task->subset.myrank == coll_args->args.root) {
int peer = ucc_ep_map_eval(task->subset.map, 1);
task->bcast_linear.key = ((uint64_t)coll_args->args.tag << 32 |
coll_args->args.root << 16 | peer);
} else {
task->bcast_linear.key =
((uint64_t)coll_args->args.tag << 32 |
coll_args->args.root << 16 | task->subset.myrank);
}
// currently we support only active set bcast with 2 ranks
// so root rank should remap phys rank of peer with rank 1
peer = (task->subset.myrank == coll_args->args.root) ? ucc_ep_map_eval(task->subset.map, 1) : task->subset.myrank;
task->bcast_linear.key = compute_key(coll_args->args.root, peer, coll_args->args.tag);
task->seq_num = team->seq_num_active_set++;
} else {
task->seq_num = team->seq_num++;
Expand All @@ -139,7 +141,8 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
return UCC_OK;
}

// check if segment for current task is available and barrier is available (completed from prev iteration)
// check if segment for current task is available and barrier is available (completed from prev iteration)
// and possibly mark the segment as occupied by updating the state counter to the current seq_num
static inline ucc_status_t ucc_tl_cuda_get_sync_root(ucc_tl_cuda_task_t *task, ucc_rank_t root)
{
ucc_tl_cuda_team_t *team = TASK_TEAM(task);
Expand Down

0 comments on commit 748c86c

Please sign in to comment.