Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Topic/improved gpu support #120

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parsec
Submodule parsec updated 113 files
12 changes: 6 additions & 6 deletions src/dplasmajdf_lapack_dtt.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,12 @@ stage_in_lapack(parsec_gpu_task_t *gtask,
assert(ddc != NULL);
elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype);
in_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_in->device_index);
if( (in_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
if( (in_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private,
copy_in->device_private,
gtask->flow_nb_elts[i],
(in_elem_dev->super.super.type != PARSEC_DEV_CUDA)?
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice,
(in_elem_dev->super.super.type & PARSEC_DEV_CUDA)?
cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );
}else{
Expand Down Expand Up @@ -227,12 +227,12 @@ stage_out_lapack(parsec_gpu_task_t *gtask,
elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype);
out_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_out->device_index);

if( (out_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
if( (out_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private,
copy_in->device_private,
gtask->flow_nb_elts[i],
out_elem_dev->super.super.type != PARSEC_DEV_CUDA ?
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice,
out_elem_dev->super.super.type & PARSEC_DEV_CUDA ?
cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );
}else{
Expand Down
2 changes: 1 addition & 1 deletion src/zgemm_NN_gpu.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ BODY [type=CUDA]
__parsec_zgemm_NN_gpu_GEMM_task_t next_gemm;
memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t));
next_gemm.locals.k.value = descC->mt -1;
assert( PARSEC_DEV_CUDA == next_gemm.task_class->incarnations[chore_id].type );
assert( PARSEC_DEV_CUDA & next_gemm.task_class->incarnations[chore_id].type );
if(NULL != next_gemm.task_class->incarnations[chore_id].evaluate) {
if( next_gemm.task_class->incarnations[chore_id].evaluate((parsec_task_t*)&next_gemm) ==
PARSEC_HOOK_RETURN_NEXT ) {
Expand Down
6 changes: 3 additions & 3 deletions src/zgemm_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
nbgpu = 0;
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
parsec_device_module_t *device = parsec_mca_device_get(dev);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device;
nbgpu++;
if( 0 == gpu_mem_block_size )
Expand All @@ -239,7 +239,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
nbgpu= 0;
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
parsec_device_module_t *device = parsec_mca_device_get(dev);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
dev_index[nbgpu++] = device->device_index;
}
}
Expand Down Expand Up @@ -461,7 +461,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB,
int64_t gpu_mem_nb_blocks = -1;
for(devid = 0; devid < (int)parsec_nb_devices; devid++) {
parsec_device_module_t *device = parsec_mca_device_get(devid);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device;
nb_gpu_devices++;
if( 0 == gpu_mem_block_size )
Expand Down
1 change: 1 addition & 0 deletions src/zpotrf_U.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ BODY [type=CUDA]
handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
assert(NULL != handles);
wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID);
if( NULL == wp ) return PARSEC_HOOK_RETURN_AGAIN;
assert(NULL != wp);

workspace = (cuDoubleComplex*)wp->tmpmem;
Expand Down
28 changes: 24 additions & 4 deletions src/zpotrf_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb )
}
}

#define USE_PARSEC_ZONE_FOR_WP_MEMORY 0

#if defined(DPLASMA_HAVE_CUDA)
void *zpotrf_create_workspace(void *obj, void *user)
{
parsec_device_module_t *mod = (parsec_device_module_t *)obj;
zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory;
cusolverDnHandle_t cusolverDnHandle;
cusolverStatus_t status;
parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)user;
Expand All @@ -73,6 +73,7 @@ void *zpotrf_create_workspace(void *obj, void *user)
size_t elt_size = sizeof(cuDoubleComplex);
cublasFillMode_t cublas_uplo;
dplasma_enum_t uplo = tp->_g_uplo;
void* tmpmem;

if( PlasmaLower == uplo )
cublas_uplo = CUBLAS_FILL_MODE_LOWER;
Expand All @@ -87,20 +88,39 @@ void *zpotrf_create_workspace(void *obj, void *user)
assert(CUSOLVER_STATUS_SUCCESS == status);

cusolverDnDestroy(cusolverDnHandle);

#if USE_PARSEC_ZONE_FOR_WP_MEMORY
parsec_device_module_t *mod = (parsec_device_module_t *)obj;
zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory;
tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int));
if( NULL == tmpmem )
return NULL;
#else
(void)obj;
cudaError_t rc = cudaMalloc(&tmpmem, workspace_size * elt_size + sizeof(int));
if( cudaSuccess != rc )
return NULL;
#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */
wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t));
wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int));
wp->tmpmem = tmpmem;
assert(NULL != wp->tmpmem);
wp->lwork = workspace_size;
#if USE_PARSEC_ZONE_FOR_WP_MEMORY
wp->memory = memory;
#else
wp->memory = NULL;
#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */

return wp;
}

static void destroy_workspace(void *_ws, void *_n)
{
dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws;
#if USE_PARSEC_ZONE_FOR_WP_MEMORY
zone_free((zone_malloc_t*)ws->memory, ws->tmpmem);
#else
(void)cudaFree(ws->tmpmem);
#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */
free(ws);
(void)_n;
}
Expand Down
52 changes: 42 additions & 10 deletions tests/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ void print_usage(void)
"\n"
" -c --cores : number of concurrent threads (default: number of physical hyper-threads)\n"
" -g --gpus : number of GPU (default: 0)\n"
" -D --gpu_mask : mask of the GPUs to be used by this process: either a positive number to indicate the GPU mask, or -1 (to use the GPUs modulo the local rank of the process)\n"
" -m --thread_multi : initialize MPI_THREAD_MULTIPLE (default: no)\n"
" -o --scheduler : select the scheduler (default: LFQ)\n"
" Accepted values:\n"
Expand Down Expand Up @@ -181,6 +182,7 @@ static struct option long_options[] =
{"scheduler", required_argument, 0, 'o'},
{"gpus", required_argument, 0, 'g'},
{"g", required_argument, 0, 'g'},
{"D", required_argument, 0, 'D'},
{"V", required_argument, 0, 'V'},
{"vpmap", required_argument, 0, 'V'},
{"ht", required_argument, 0, 'H'},
Expand Down Expand Up @@ -324,6 +326,16 @@ static void read_arguments(int *_argc, char*** _argv, int* iparam)
}
iparam[IPARAM_NGPUS] = atoi(optarg);
break;
case 'D':
#if !defined(DPLASMA_HAVE_CUDA)
iparam[IPARAM_GPU_MASK] = DPLASMA_ERR_NOT_SUPPORTED; /* force an error message */
#endif
if(iparam[IPARAM_GPU_MASK] == DPLASMA_ERR_NOT_SUPPORTED) {
fprintf(stderr, "#!!!!! This test does not have GPU support. GPU disabled.\n");
break;
}
iparam[IPARAM_GPU_MASK] = atoi(optarg);
break;

case 'p': case 'P': iparam[IPARAM_P] = atoi(optarg); break;
case 'q': case 'Q': iparam[IPARAM_Q] = atoi(optarg); break;
Expand Down Expand Up @@ -448,7 +460,6 @@ static void parse_arguments(int *iparam) {
parsec_setenv_mca_param( "device_cuda_enabled", value, &environ );
free(value);
}

/* Check the process grid */
if(0 == iparam[IPARAM_P])
iparam[IPARAM_P] = iparam[IPARAM_NNODES];
Expand Down Expand Up @@ -510,6 +521,25 @@ static void parse_arguments(int *iparam) {
/* HQR */
if(-'P' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_P];
if(-'Q' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_Q];
#if defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI)
if(iparam[IPARAM_NGPUS] > 0 && (-1 == iparam[IPARAM_GPU_MASK])) {
MPI_Comm local_comm;
int local_rank, local_size;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &local_comm);
MPI_Comm_rank(local_comm, &local_rank);
MPI_Comm_size(local_comm, &local_size);
MPI_Comm_free(&local_comm);
iparam[IPARAM_GPU_MASK] = 0;
for( int i = 0; i <= iparam[IPARAM_NGPUS]; i++ ) {
iparam[IPARAM_GPU_MASK] |= ((1 << local_rank) << i);
}
char* value;
asprintf(&value, "%d", iparam[IPARAM_GPU_MASK]);
parsec_setenv_mca_param("device_cuda_mask", value, &environ);
free(value); value = NULL;
}
#endif /* defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI) */
}

static void print_arguments(int* iparam)
Expand Down Expand Up @@ -698,21 +728,23 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam)
int dev, nbgpu = 0;
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
parsec_device_module_t *device = parsec_mca_device_get(dev);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
nbgpu++;
}
}
if( nbgpu > 0 ) {
if( iparam[IPARAM_NGPUS] < 0 ) {
iparam[IPARAM_NGPUS] = nbgpu;
}
if( iparam[IPARAM_NGPUS] > 0 ) {
if(iparam[IPARAM_VERBOSE] >= 3) {
parsec_setenv_mca_param( "device_show_statistics", "1", &environ );
}
CuHI = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES",
dplasma_destroy_cuda_handles, NULL,
dplasma_create_cuda_handles, NULL,
NULL);
dplasma_destroy_cuda_handles, NULL,
dplasma_create_cuda_handles, NULL,
NULL);
assert(-1 != CuHI);
}
iparam[IPARAM_NGPUS] = nbgpu;
if(iparam[IPARAM_NGPUS] > 0 && iparam[IPARAM_VERBOSE] >= 3) {
parsec_setenv_mca_param( "device_show_statistics", "1", &environ );
}
#endif

print_arguments(iparam);
Expand Down
1 change: 1 addition & 0 deletions tests/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ enum iparam_t {
IPARAM_BUT_LEVEL, /* Butterfly level */
IPARAM_SCHEDULER, /* User-selected scheduler */
IPARAM_NRUNS, /* Number of times to run the kernel */
IPARAM_GPU_MASK, /* The mask of GPU to be used */
IPARAM_SIZEOF
};

Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgelqf.c
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgelqf->nb_task_classes; i++) {
for(int j = 0; NULL != zgelqf->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgelqf->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgelqf->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -426,7 +426,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgelqf_hqr.c
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgelqf_param_tp->nb_task_classes; i++) {
for(int j = 0; NULL != zgelqf_param_tp->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -480,7 +480,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgelqf_systolic.c
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgelqf_sys_tp->nb_task_classes; i++) {
for(int j = 0; NULL != zgelqf_sys_tp->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -474,7 +474,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgeqrf_hqr.c
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgeqrf_hqr->nb_task_classes; i++) {
for(int j = 0; NULL != zgeqrf_hqr->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -480,7 +480,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgeqrf_systolic.c
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgeqrf_systolic->nb_task_classes; i++) {
for(int j = 0; NULL != zgeqrf_systolic->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -466,7 +466,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgesvd.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgesvd->nb_task_classes; i++) {
for(int j = 0; NULL != zgesvd->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgesvd->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgesvd->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -339,7 +339,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
Loading
Loading