From 7655f876de176ad26ce39c62f6e214f07fc2295e Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 22:41:33 -0700 Subject: [PATCH 1/3] Add support for GPU mask. Leave set to -1 to allow parsec to spread the node processes across all available GPUs, while respecting the number of requested GPUs. Set to some other value to force parsec to use a specific set of GPUs. Signed-off-by: George Bosilca --- tests/common.c | 52 ++++++++++++++++++++++++++++++++++++++++---------- tests/common.h | 1 + 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/tests/common.c b/tests/common.c index 4e390732..e981bfd8 100644 --- a/tests/common.c +++ b/tests/common.c @@ -124,6 +124,7 @@ void print_usage(void) "\n" " -c --cores : number of concurent threads (default: number of physical hyper-threads)\n" " -g --gpus : number of GPU (default: 0)\n" + " -D --gpu_mask : mask of the GPUs to be used by this process either a positive number to indicate the GPU mask or -1 (to use the GPUs module the local rank of the process)\n" " -m --thread_multi : initialize MPI_THREAD_MULTIPLE (default: no)\n" " -o --scheduler : select the scheduler (default: LFQ)\n" " Accepted values:\n" @@ -181,6 +182,7 @@ static struct option long_options[] = {"scheduler", required_argument, 0, 'o'}, {"gpus", required_argument, 0, 'g'}, {"g", required_argument, 0, 'g'}, + {"D", required_argument, 0, 'D'}, {"V", required_argument, 0, 'V'}, {"vpmap", required_argument, 0, 'V'}, {"ht", required_argument, 0, 'H'}, @@ -324,6 +326,16 @@ static void read_arguments(int *_argc, char*** _argv, int* iparam) } iparam[IPARAM_NGPUS] = atoi(optarg); break; + case 'D': +#if !defined(DPLASMA_HAVE_CUDA) + iparam[IPARAM_GPU_MASK] = DPLASMA_ERR_NOT_SUPPORTED; /* force an error message */ +#endif + if(iparam[IPARAM_GPU_MASK] == DPLASMA_ERR_NOT_SUPPORTED) { + fprintf(stderr, "#!!!!! This test does not have GPU support. 
GPU disabled.\n"); + break; + } + iparam[IPARAM_GPU_MASK] = atoi(optarg); + break; case 'p': case 'P': iparam[IPARAM_P] = atoi(optarg); break; case 'q': case 'Q': iparam[IPARAM_Q] = atoi(optarg); break; @@ -448,7 +460,6 @@ static void parse_arguments(int *iparam) { parsec_setenv_mca_param( "device_cuda_enabled", value, &environ ); free(value); } - /* Check the process grid */ if(0 == iparam[IPARAM_P]) iparam[IPARAM_P] = iparam[IPARAM_NNODES]; @@ -510,6 +521,25 @@ static void parse_arguments(int *iparam) { /* HQR */ if(-'P' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_P]; if(-'Q' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_Q]; +#if defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI) + if(iparam[IPARAM_NGPUS] > 0 && (-1 == iparam[IPARAM_GPU_MASK])) { + MPI_Comm local_comm; + int local_rank, local_size; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + iparam[IPARAM_GPU_MASK] = 0; + for( int i = 0; i <= iparam[IPARAM_NGPUS]; i++ ) { + iparam[IPARAM_GPU_MASK] |= ((1 << local_rank) << i); + } + char* value; + asprintf(&value, "%d", iparam[IPARAM_GPU_MASK]); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); value = NULL; + } +#endif /* defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI) */ } static void print_arguments(int* iparam) @@ -698,21 +728,23 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) int dev, nbgpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nbgpu++; } } - if( nbgpu > 0 ) { + if( iparam[IPARAM_NGPUS] < 0 ) { + iparam[IPARAM_NGPUS] = nbgpu; + } + if( iparam[IPARAM_NGPUS] > 0 ) { + if(iparam[IPARAM_VERBOSE] >= 3) { + parsec_setenv_mca_param( "device_show_statistics", "1", &environ ); + } CuHI = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", - dplasma_destroy_cuda_handles, NULL, - dplasma_create_cuda_handles, NULL, - NULL); + dplasma_destroy_cuda_handles, NULL, + dplasma_create_cuda_handles, NULL, + NULL); assert(-1 != CuHI); } - iparam[IPARAM_NGPUS] = nbgpu; - if(iparam[IPARAM_NGPUS] > 0 && iparam[IPARAM_VERBOSE] >= 3) { - parsec_setenv_mca_param( "device_show_statistics", "1", &environ ); - } #endif print_arguments(iparam); diff --git a/tests/common.h b/tests/common.h index f420329e..c4b15253 100644 --- a/tests/common.h +++ b/tests/common.h @@ -74,6 +74,7 @@ enum iparam_t { IPARAM_BUT_LEVEL, /* Butterfly level */ IPARAM_SCHEDULER, /* User-selected scheduler */ IPARAM_NRUNS, /* Number of times to run the kernel */ + IPARAM_GPU_MASK, /* The mask of GPU to be used */ IPARAM_SIZEOF }; From c90f0bfcf2773dcbcecf7fde13b262f6601fe1ab Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 22:43:56 -0700 Subject: [PATCH 2/3] Dont use zone_malloc to manage the temporary workspace. 
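By default the cuSolver workspace is now obtained directly from cudaMalloc instead of being carved out of the zone_malloc area managed by the CUDA device; the previous behavior is kept behind the USE_PARSEC_ZONE_FOR_WP_MEMORY compile-time switch (0 by default). The CUDA body in zpotrf_U.jdf now returns PARSEC_HOOK_RETURN_AGAIN when the workspace is not available, instead of failing the assert.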
Signed-off-by: George Bosilca --- src/zpotrf_U.jdf | 1 + src/zpotrf_wrapper.c | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 3f69415e..7e3ab630 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -134,6 +134,7 @@ BODY [type=CUDA] handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); assert(NULL != handles); wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID); + if( NULL == wp ) return PARSEC_HOOK_RETURN_AGAIN; assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 8699ee0d..002bc796 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -58,11 +58,11 @@ dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb ) } } +#define USE_PARSEC_ZONE_FOR_WP_MEMORY 0 + #if defined(DPLASMA_HAVE_CUDA) void *zpotrf_create_workspace(void *obj, void *user) { - parsec_device_module_t *mod = (parsec_device_module_t *)obj; - zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory; cusolverDnHandle_t cusolverDnHandle; cusolverStatus_t status; parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)user; @@ -73,6 +73,7 @@ void *zpotrf_create_workspace(void *obj, void *user) size_t elt_size = sizeof(cuDoubleComplex); cublasFillMode_t cublas_uplo; dplasma_enum_t uplo = tp->_g_uplo; + void* tmpmem; if( PlasmaLower == uplo ) cublas_uplo = CUBLAS_FILL_MODE_LOWER; @@ -87,12 +88,27 @@ void *zpotrf_create_workspace(void *obj, void *user) assert(CUSOLVER_STATUS_SUCCESS == status); cusolverDnDestroy(cusolverDnHandle); - +#if USE_PARSEC_ZONE_FOR_WP_MEMORY + parsec_device_module_t *mod = (parsec_device_module_t *)obj; + zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory; + tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); + if( NULL == tmpmem ) + return NULL; +#else + (void)obj; + cudaError_t rc = cudaMalloc(&tmpmem, workspace_size * elt_size + sizeof(int)); + if( cudaSuccess != rc ) + return NULL; +#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */ wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t)); - wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); + wp->tmpmem = tmpmem; assert(NULL != wp->tmpmem); wp->lwork = workspace_size; +#if USE_PARSEC_ZONE_FOR_WP_MEMORY wp->memory = memory; +#else + wp->memory = NULL; +#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */ return wp; } @@ -100,7 +116,11 @@ void *zpotrf_create_workspace(void *obj, void *user) static void destroy_workspace(void *_ws, void *_n) { dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; +#if USE_PARSEC_ZONE_FOR_WP_MEMORY zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); +#else + (void)cudaFree(ws->tmpmem); +#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */ free(ws); (void)_n; } From 311c722993cc9fefccd7f924dbbcadfd44dc265a Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 22:46:50 -0700 Subject: [PATCH 3/3] Improve the device matching. 
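Device-type checks in the wrappers, the zgemm_NN_gpu JDF, and the test warmup loops now treat the type field as a bitmask (PARSEC_DEV_CUDA & device->type) instead of requiring strict equality, and the cudaMemcpyAsync direction in the LAPACK stage-in/stage-out helpers is selected with the same bitmask test. The parsec submodule is updated to 7f6bdd5b.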
Signed-off-by: George Bosilca --- parsec | 2 +- src/dplasmajdf_lapack_dtt.h | 12 ++++++------ src/zgemm_NN_gpu.jdf | 2 +- src/zgemm_wrapper.c | 6 +++--- tests/testing_zgelqf.c | 4 ++-- tests/testing_zgelqf_hqr.c | 4 ++-- tests/testing_zgelqf_systolic.c | 4 ++-- tests/testing_zgeqrf_hqr.c | 4 ++-- tests/testing_zgeqrf_systolic.c | 4 ++-- tests/testing_zgesvd.c | 4 ++-- tests/testing_zgetrf_incpiv.c | 4 ++-- tests/testing_zgetrf_incpiv_dtd.c | 4 ++-- tests/testing_zheev.c | 4 ++-- tests/testing_zpotrf.c | 2 -- 14 files changed, 29 insertions(+), 31 deletions(-) diff --git a/parsec b/parsec index adabbd4d..7f6bdd5b 160000 --- a/parsec +++ b/parsec @@ -1 +1 @@ -Subproject commit adabbd4d1fb580358a32d489df19fa9c05a316e1 +Subproject commit 7f6bdd5b8e7fdbbda28f982829a87e1ea5394350 diff --git a/src/dplasmajdf_lapack_dtt.h b/src/dplasmajdf_lapack_dtt.h index 4937fb72..13eb4942 100644 --- a/src/dplasmajdf_lapack_dtt.h +++ b/src/dplasmajdf_lapack_dtt.h @@ -150,12 +150,12 @@ stage_in_lapack(parsec_gpu_task_t *gtask, assert(ddc != NULL); elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); in_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_in->device_index); - if( (in_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + if( (in_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, gtask->flow_nb_elts[i], - (in_elem_dev->super.super.type != PARSEC_DEV_CUDA)? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, + (in_elem_dev->super.super.type & PARSEC_DEV_CUDA)? + cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, cuda_stream->cuda_stream); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); }else{ @@ -227,12 +227,12 @@ stage_out_lapack(parsec_gpu_task_t *gtask, elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); out_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_out->device_index); - if( (out_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + if( (out_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, gtask->flow_nb_elts[i], - out_elem_dev->super.super.type != PARSEC_DEV_CUDA ? - cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice, + out_elem_dev->super.super.type & PARSEC_DEV_CUDA ? 
+ cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost, cuda_stream->cuda_stream); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); }else{ diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index eb723c32..607a0d7b 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -413,7 +413,7 @@ BODY [type=CUDA] __parsec_zgemm_NN_gpu_GEMM_task_t next_gemm; memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t)); next_gemm.locals.k.value = descC->mt -1; - assert( PARSEC_DEV_CUDA == next_gemm.task_class->incarnations[chore_id].type ); + assert( PARSEC_DEV_CUDA & next_gemm.task_class->incarnations[chore_id].type ); if(NULL != next_gemm.task_class->incarnations[chore_id].evaluate) { if( next_gemm.task_class->incarnations[chore_id].evaluate((parsec_task_t*)&next_gemm) == PARSEC_HOOK_RETURN_NEXT ) { diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index e2d5ebb7..fd1ce158 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -222,7 +222,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; nbgpu++; if( 0 == gpu_mem_block_size ) @@ -239,7 +239,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu= 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[nbgpu++] = device->device_index; } } @@ -461,7 +461,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, int64_t gpu_mem_nb_blocks = -1; for(devid = 0; devid < (int)parsec_nb_devices; devid++) { parsec_device_module_t *device = parsec_mca_device_get(devid); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; nb_gpu_devices++; if( 0 == gpu_mem_block_size ) diff --git a/tests/testing_zgelqf.c b/tests/testing_zgelqf.c index 4e592cde..c962f700 100644 --- a/tests/testing_zgelqf.c +++ b/tests/testing_zgelqf.c @@ -417,7 +417,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgelqf->nb_task_classes; i++) { for(int j = 0; NULL != zgelqf->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgelqf->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgelqf->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -426,7 +426,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgelqf_hqr.c b/tests/testing_zgelqf_hqr.c index 147d6bcc..be2d1349 100644 --- a/tests/testing_zgelqf_hqr.c +++ b/tests/testing_zgelqf_hqr.c @@ -471,7 +471,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgelqf_param_tp->nb_task_classes; i++) { for(int j = 0; NULL != zgelqf_param_tp->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -480,7 +480,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgelqf_systolic.c b/tests/testing_zgelqf_systolic.c index 08b7b64e..0afe9187 100644 --- a/tests/testing_zgelqf_systolic.c +++ b/tests/testing_zgelqf_systolic.c @@ -465,7 +465,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgelqf_sys_tp->nb_task_classes; i++) { for(int j = 0; NULL != zgelqf_sys_tp->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -474,7 +474,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgeqrf_hqr.c b/tests/testing_zgeqrf_hqr.c index 09d260a1..bfa4cc7d 100644 --- a/tests/testing_zgeqrf_hqr.c +++ b/tests/testing_zgeqrf_hqr.c @@ -471,7 +471,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgeqrf_hqr->nb_task_classes; i++) { for(int j = 0; NULL != zgeqrf_hqr->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -480,7 +480,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgeqrf_systolic.c b/tests/testing_zgeqrf_systolic.c index 8f9db2a8..96e2a0e9 100644 --- a/tests/testing_zgeqrf_systolic.c +++ b/tests/testing_zgeqrf_systolic.c @@ -457,7 +457,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgeqrf_systolic->nb_task_classes; i++) { for(int j = 0; NULL != zgeqrf_systolic->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -466,7 +466,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgesvd.c b/tests/testing_zgesvd.c index 02787cd7..70c04287 100644 --- a/tests/testing_zgesvd.c +++ b/tests/testing_zgesvd.c @@ -330,7 +330,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgesvd->nb_task_classes; i++) { for(int j = 0; NULL != zgesvd->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgesvd->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgesvd->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -339,7 +339,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgetrf_incpiv.c b/tests/testing_zgetrf_incpiv.c index 2eef974c..6705fd3f 100644 --- a/tests/testing_zgetrf_incpiv.c +++ b/tests/testing_zgetrf_incpiv.c @@ -313,7 +313,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -322,7 +322,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zgetrf_incpiv_dtd.c b/tests/testing_zgetrf_incpiv_dtd.c index 298729c9..d98d25ac 100644 --- a/tests/testing_zgetrf_incpiv_dtd.c +++ b/tests/testing_zgetrf_incpiv_dtd.c @@ -616,7 +616,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { - if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -625,7 +625,7 @@ static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zheev.c b/tests/testing_zheev.c index c2845e50..cbd49002 100644 --- a/tests/testing_zheev.c +++ b/tests/testing_zheev.c @@ -350,7 +350,7 @@ static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { for(int i = 0; i < (int)zherbt->nb_task_classes; i++) { for(int j = 0; NULL != zherbt->task_classes_array[i]->incarnations[j].hook; j++) { - if( zherbt->task_classes_array[i]->incarnations[j].type == dtype ) { + if( zherbt->task_classes_array[i]->incarnations[j].type & dtype ) { goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ } } @@ -359,7 +359,7 @@ static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t do_run: for(int did = 0; did < (int)parsec_nb_devices; did++) { parsec_device_module_t *dev = parsec_mca_device_get(did); - if(dev->type != dtype) + if( !(dev->type & dtype) ) continue; /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ for(int m = 0; m < MT; m++) { diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c index 7d0fa3b7..68ca276e 100644 --- a/tests/testing_zpotrf.c +++ b/tests/testing_zpotrf.c @@ -67,7 +67,6 @@ int main(int argc, char ** argv) dplasma_zpotrf_Destruct( PARSEC_zpotrf ); parsec_taskpool_sync_ids(); /* recursive DAGs are not synchronous on ids */ - } else { @@ -76,7 +75,6 @@ int main(int argc, char ** argv) dplasma_zpotrf_Destruct( PARSEC_zpotrf )); } parsec_devices_reset_load(parsec); - } if( 0 == rank && info != 0 ) {
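Usage sketch for the GPU options added in patch 1 (the testing_zpotrf binary is part of tests/, but the -N size flag and the mpirun launcher shown here are illustrative assumptions, not taken from these patches):

    # request 2 GPUs per process and let the GPU mask be derived from the
    # local rank of the process on the node
    mpirun -np 4 ./testing_zpotrf -N 8000 -g 2 -D -1

    # request 2 GPUs and pin this process explicitly to GPUs 0 and 1 (mask 0x3)
    ./testing_zpotrf -N 8000 -g 2 -D 3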