From 7655f876de176ad26ce39c62f6e214f07fc2295e Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 22:41:33 -0700 Subject: [PATCH 1/3] Add support for GPU mask. Leave set to -1 to allow parsec to spread the node processes across all available GPUs, while respecting the number of requested GPUs. Set to some other value to force parsec to use a specific set of GPUs. Signed-off-by: George Bosilca --- tests/common.c | 52 ++++++++++++++++++++++++++++++++++++++++---------- tests/common.h | 1 + 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/tests/common.c b/tests/common.c index 4e390732..e981bfd8 100644 --- a/tests/common.c +++ b/tests/common.c @@ -124,6 +124,7 @@ void print_usage(void) "\n" " -c --cores : number of concurent threads (default: number of physical hyper-threads)\n" " -g --gpus : number of GPU (default: 0)\n" + " -D --gpu_mask : mask of the GPUs to be used by this process either a positive number to indicate the GPU mask or -1 (to use the GPUs module the local rank of the process)\n" " -m --thread_multi : initialize MPI_THREAD_MULTIPLE (default: no)\n" " -o --scheduler : select the scheduler (default: LFQ)\n" " Accepted values:\n" @@ -181,6 +182,7 @@ static struct option long_options[] = {"scheduler", required_argument, 0, 'o'}, {"gpus", required_argument, 0, 'g'}, {"g", required_argument, 0, 'g'}, + {"D", required_argument, 0, 'D'}, {"V", required_argument, 0, 'V'}, {"vpmap", required_argument, 0, 'V'}, {"ht", required_argument, 0, 'H'}, @@ -324,6 +326,16 @@ static void read_arguments(int *_argc, char*** _argv, int* iparam) } iparam[IPARAM_NGPUS] = atoi(optarg); break; + case 'D': +#if !defined(DPLASMA_HAVE_CUDA) + iparam[IPARAM_GPU_MASK] = DPLASMA_ERR_NOT_SUPPORTED; /* force an error message */ +#endif + if(iparam[IPARAM_GPU_MASK] == DPLASMA_ERR_NOT_SUPPORTED) { + fprintf(stderr, "#!!!!! This test does not have GPU support. 
GPU disabled.\n"); + break; + } + iparam[IPARAM_GPU_MASK] = atoi(optarg); + break; case 'p': case 'P': iparam[IPARAM_P] = atoi(optarg); break; case 'q': case 'Q': iparam[IPARAM_Q] = atoi(optarg); break; @@ -448,7 +460,6 @@ static void parse_arguments(int *iparam) { parsec_setenv_mca_param( "device_cuda_enabled", value, &environ ); free(value); } - /* Check the process grid */ if(0 == iparam[IPARAM_P]) iparam[IPARAM_P] = iparam[IPARAM_NNODES]; @@ -510,6 +521,25 @@ static void parse_arguments(int *iparam) { /* HQR */ if(-'P' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_P]; if(-'Q' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_Q]; +#if defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI) + if(iparam[IPARAM_NGPUS] > 0 && (-1 == iparam[IPARAM_GPU_MASK])) { + MPI_Comm local_comm; + int local_rank, local_size; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + iparam[IPARAM_GPU_MASK] = 0; + for( int i = 0; i <= iparam[IPARAM_NGPUS]; i++ ) { + iparam[IPARAM_GPU_MASK] |= ((1 << local_rank) << i); + } + char* value; + asprintf(&value, "%d", iparam[IPARAM_GPU_MASK]); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); value = NULL; + } +#endif /* defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI) */ } static void print_arguments(int* iparam) @@ -698,21 +728,23 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) int dev, nbgpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nbgpu++; } } - if( nbgpu > 0 ) { + if( iparam[IPARAM_NGPUS] < 0 ) { + iparam[IPARAM_NGPUS] = nbgpu; + } + if( iparam[IPARAM_NGPUS] > 0 ) { + if(iparam[IPARAM_VERBOSE] >= 3) { + parsec_setenv_mca_param( "device_show_statistics", "1", &environ ); + } CuHI = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", - dplasma_destroy_cuda_handles, NULL, - dplasma_create_cuda_handles, NULL, - NULL); + dplasma_destroy_cuda_handles, NULL, + dplasma_create_cuda_handles, NULL, + NULL); assert(-1 != CuHI); } - iparam[IPARAM_NGPUS] = nbgpu; - if(iparam[IPARAM_NGPUS] > 0 && iparam[IPARAM_VERBOSE] >= 3) { - parsec_setenv_mca_param( "device_show_statistics", "1", &environ ); - } #endif print_arguments(iparam); diff --git a/tests/common.h b/tests/common.h index f420329e..c4b15253 100644 --- a/tests/common.h +++ b/tests/common.h @@ -74,6 +74,7 @@ enum iparam_t { IPARAM_BUT_LEVEL, /* Butterfly level */ IPARAM_SCHEDULER, /* User-selected scheduler */ IPARAM_NRUNS, /* Number of times to run the kernel */ + IPARAM_GPU_MASK, /* The mask of GPU to be used */ IPARAM_SIZEOF }; From c90f0bfcf2773dcbcecf7fde13b262f6601fe1ab Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 22:43:56 -0700 Subject: [PATCH 2/3] Dont use zone_malloc to manage the temporary workspace. 
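By default the cuSolver workspace is now obtained directly from cudaMalloc instead of being carved out of the zone_malloc area managed by the CUDA device; the previous behavior is kept behind the USE_PARSEC_ZONE_FOR_WP_MEMORY compile-time switch (0 by default). The CUDA body in zpotrf_U.jdf now returns PARSEC_HOOK_RETURN_AGAIN when the workspace is not available, instead of failing the assert.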
Signed-off-by: George Bosilca --- src/zpotrf_U.jdf | 1 + src/zpotrf_wrapper.c | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 3f69415e..7e3ab630 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -134,6 +134,7 @@ BODY [type=CUDA] handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID); + if( NULL == wp ) return PARSEC_HOOK_RETURN_AGAIN; assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 8699ee0d..002bc796 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -58,11 +58,11 @@ dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb ) } } +#define USE_PARSEC_ZONE_FOR_WP_MEMORY 0 + #if defined(DPLASMA_HAVE_CUDA) void *zpotrf_create_workspace(void *obj, void *user) { - parsec_device_module_t *mod = (parsec_device_module_t *)obj; - zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory; cusolverDnHandle_t cusolverDnHandle; cusolverStatus_t status; parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)user; @@ -73,6 +73,7 @@ void *zpotrf_create_workspace(void *obj, void *user) size_t elt_size = sizeof(cuDoubleComplex); cublasFillMode_t cublas_uplo; dplasma_enum_t uplo = tp->_g_uplo; + void* tmpmem; if( PlasmaLower == uplo ) cublas_uplo = CUBLAS_FILL_MODE_LOWER; @@ -87,12 +88,27 @@ void *zpotrf_create_workspace(void *obj, void *user) assert(CUSOLVER_STATUS_SUCCESS == status); cusolverDnDestroy(cusolverDnHandle); - +#if USE_PARSEC_ZONE_FOR_WP_MEMORY + parsec_device_module_t *mod = (parsec_device_module_t *)obj; + zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory; + tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); + if( NULL == tmpmem ) + return NULL; +#else + (void)obj; + cudaError_t rc = cudaMalloc(&tmpmem, workspace_size * elt_size + sizeof(int)); + if( cudaSuccess != rc ) + return NULL; +#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */ wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t)); - wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); + wp->tmpmem = tmpmem; assert(NULL != wp->tmpmem); wp->lwork = workspace_size; +#if USE_PARSEC_ZONE_FOR_WP_MEMORY wp->memory = memory; +#else + wp->memory = NULL; +#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */ return wp; } @@ -100,7 +116,11 @@ void *zpotrf_create_workspace(void *obj, void *user) static void destroy_workspace(void *_ws, void *_n) { dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; +#if USE_PARSEC_ZONE_FOR_WP_MEMORY zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); +#else + (void)cudaFree(ws->tmpmem); +#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */ free(ws); (void)_n; } From 311c722993cc9fefccd7f924dbbcadfd44dc265a Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 22:46:50 -0700 Subject: [PATCH 3/3] Improve the device matching. 
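Device-type checks in the wrappers, the zgemm_NN_gpu JDF, and the test warmup loops now treat the type field as a bitmask (PARSEC_DEV_CUDA & device->type) instead of requiring strict equality, and the cudaMemcpyAsync direction in the LAPACK stage-in/stage-out helpers is selected with the same bitmask test. The parsec submodule is updated to 7f6bdd5b.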
Signed-off-by: George Bosilca --- parsec | 2 +- src/dplasmajdf_lapack_dtt.h | 12 ++++++------ src/zgemm_NN_gpu.jdf | 2 +- src/zgemm_wrapper.c | 6 +++--- tests/testing_zgelqf.c | 4 ++-- tests/testing_zgelqf_hqr.c | 4 ++-- tests/testing_zgelqf_systolic.c | 4 ++-- tests/testing_zgeqrf_hqr.c | 4 ++-- tests/testing_zgeqrf_systolic.c | 4 ++-- tests/testing_zgesvd.c | 4 ++-- tests/testing_zgetrf_incpiv.c | 4 ++-- tests/testing_zgetrf_incpiv_dtd.c | 4 ++-- tests/testing_zheev.c | 4 ++-- tests/testing_zpotrf.c | 2 -- 14 files changed, 29 insertions(+), 31 deletions(-) diff --git a/parsec b/parsec index adabbd4d..7f6bdd5b 160000 --- a/parsec +++ b/parsec @@ -1 +1 @@ -Subproject commit adabbd4d1fb580358a32d489df19fa9c05a316e1 +Subproject commit 7f6bdd5b8e7fdbbda28f982829a87e1ea5394350 diff --git a/src/dplasmajdf_lapack_dtt.h b/src/dplasmajdf_lapack_dtt.h index 4937fb72..13eb4942 100644 --- a/src/dplasmajdf_lapack_dtt.h +++ b/src/dplasmajdf_lapack_dtt.h @@ -150,12 +150,12 @@ stage_in_lapack(parsec_gpu_task_t *gtask, assert(ddc != NULL); elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); in_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_in->device_index); - if( (in_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + if( (in_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, gtask->flow_nb_elts[i], - (in_elem_dev->super.super.type != PARSEC_DEV_CUDA)? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, + (in_elem_dev->super.super.type & PARSEC_DEV_CUDA)? + cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, cuda_stream->cuda_stream); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); }else{ @@ -227,12 +227,12 @@ stage_out_lapack(parsec_gpu_task_t *gtask, elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); out_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_out->device_index); - if( (out_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + if( (out_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, gtask->flow_nb_elts[i], - out_elem_dev->super.super.type != PARSEC_DEV_CUDA ? - cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice, + out_elem_dev->super.super.type & PARSEC_DEV_CUDA ? 
+ cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost, cuda_stream->cuda_stream); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); }else{ diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index eb723c32..607a0d7b 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -413,7 +413,7 @@ BODY [type=CUDA] __parsec_zgemm_NN_gpu_GEMM_task_t next_gemm; memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t)); next_gemm.locals.k.value = descC->mt -1; - assert( PARSEC_DEV_CUDA == next_gemm.task_class->incarnations[chore_id].type ); + assert( PARSEC_DEV_CUDA & next_gemm.task_class->incarnations[chore_id].type ); if(NULL != next_gemm.task_class->incarnations[chore_id].evaluate) { if( next_gemm.task_class->incarnations[chore_id].evaluate((parsec_task_t*)&next_gemm) == PARSEC_HOOK_RETURN_NEXT ) { diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index e2d5ebb7..fd1ce158 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -222,7 +222,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; nbgpu++; if( 0 == gpu_mem_block_size ) @@ -239,7 +239,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu= 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[nbgpu++] = device->device_index; } } @@ -461,7 +461,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, int64_t gpu_mem_nb_blocks = -1; for(devid = 0; devid < (int)parsec_nb_devices; devid++) { parsec_device_module_t *device = parsec_mca_device_get(devid); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; nb_gpu_devices++; if( 0 == gpu_mem_block_size ) diff --git a/tests/testing_zgelqf.c b/tests/testing_zgelqf.c index 4e592cde..c962f700 100644 --- a/tests/testing_zgelqf.c +++ b/tests/testing_zgelqf.c @@ -417,7 +417,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgelqf->nb_task_classes; i++) { for(int j = 0; NULL != zgelqf->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgelqf->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgelqf->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -426,7 +426,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgelqf_hqr.c b/tests/testing_zgelqf_hqr.c index 147d6bcc..be2d1349 100644 --- a/tests/testing_zgelqf_hqr.c +++ b/tests/testing_zgelqf_hqr.c @@ -471,7 +471,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgelqf_param_tp->nb_task_classes; i++) { for(int j = 0; NULL != zgelqf_param_tp->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -480,7 +480,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgelqf_systolic.c b/tests/testing_zgelqf_systolic.c index 08b7b64e..0afe9187 100644 --- a/tests/testing_zgelqf_systolic.c +++ b/tests/testing_zgelqf_systolic.c @@ -465,7 +465,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgelqf_sys_tp->nb_task_classes; i++) { for(int j = 0; NULL != zgelqf_sys_tp->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -474,7 +474,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgeqrf_hqr.c b/tests/testing_zgeqrf_hqr.c index 09d260a1..bfa4cc7d 100644 --- a/tests/testing_zgeqrf_hqr.c +++ b/tests/testing_zgeqrf_hqr.c @@ -471,7 +471,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgeqrf_hqr->nb_task_classes; i++) { for(int j = 0; NULL != zgeqrf_hqr->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -480,7 +480,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgeqrf_systolic.c b/tests/testing_zgeqrf_systolic.c index 8f9db2a8..96e2a0e9 100644 --- a/tests/testing_zgeqrf_systolic.c +++ b/tests/testing_zgeqrf_systolic.c @@ -457,7 +457,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgeqrf_systolic->nb_task_classes; i++) { for(int j = 0; NULL != zgeqrf_systolic->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -466,7 +466,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgesvd.c b/tests/testing_zgesvd.c index 02787cd7..70c04287 100644 --- a/tests/testing_zgesvd.c +++ b/tests/testing_zgesvd.c @@ -330,7 +330,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgesvd->nb_task_classes; i++) { for(int j = 0; NULL != zgesvd->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgesvd->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgesvd->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -339,7 +339,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgetrf_incpiv.c b/tests/testing_zgetrf_incpiv.c index 2eef974c..6705fd3f 100644 --- a/tests/testing_zgetrf_incpiv.c +++ b/tests/testing_zgetrf_incpiv.c @@ -313,7 +313,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -322,7 +322,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgetrf_incpiv_dtd.c b/tests/testing_zgetrf_incpiv_dtd.c index 298729c9..d98d25ac 100644 --- a/tests/testing_zgetrf_incpiv_dtd.c +++ b/tests/testing_zgetrf_incpiv_dtd.c @@ -616,7 +616,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -625,7 +625,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zheev.c b/tests/testing_zheev.c index c2845e50..cbd49002 100644 --- a/tests/testing_zheev.c +++ b/tests/testing_zheev.c @@ -350,7 +350,7 @@ static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zherbt->nb_task_classes; i++) { for(int j = 0; NULL != zherbt->task_classes_array[i]->incarnations[j].hook; j++) { - if( zherbt->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zherbt->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -359,7 +359,7 @@ static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c index 7d0fa3b7..68ca276e 100644 --- a/tests/testing_zpotrf.c +++ b/tests/testing_zpotrf.c @@ -67,7 +67,6 @@ int main(int argc, char ** argv) dplasma_zpotrf_Destruct( PARSEC_zpotrf ); parsec_taskpool_sync_ids(); /* recursive DAGs are not synchronous on ids */ - } else { @@ -76,7 +75,6 @@ int main(int argc, char ** argv) dplasma_zpotrf_Destruct( PARSEC_zpotrf )); } parsec_devices_reset_load(parsec); - } if( 0 == rank && info != 0 ) {
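Usage sketch for the GPU options added in patch 1 (the testing_zpotrf binary is part of tests/, but the -N size flag and the mpirun launcher shown here are illustrative assumptions, not taken from these patches):

    # request 2 GPUs per process and let the GPU mask be derived from the
    # local rank of the process on the node
    mpirun -np 4 ./testing_zpotrf -N 8000 -g 2 -D -1

    # request 2 GPUs and pin this process explicitly to GPUs 0 and 1 (mask 0x3)
    ./testing_zpotrf -N 8000 -g 2 -D 3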