Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Topic/improved gpu support #120

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parsec
Submodule parsec updated 113 files
12 changes: 6 additions & 6 deletions src/dplasmajdf_lapack_dtt.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,12 @@ stage_in_lapack(parsec_gpu_task_t *gtask,
assert(ddc != NULL);
elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype);
in_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_in->device_index);
if( (in_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
if( (in_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private,
copy_in->device_private,
gtask->flow_nb_elts[i],
(in_elem_dev->super.super.type != PARSEC_DEV_CUDA)?
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice,
(in_elem_dev->super.super.type & PARSEC_DEV_CUDA)?
cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );
}else{
Expand Down Expand Up @@ -227,12 +227,12 @@ stage_out_lapack(parsec_gpu_task_t *gtask,
elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype);
out_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_out->device_index);

if( (out_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
if( (out_elem_dev->super.super.type & PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private,
copy_in->device_private,
gtask->flow_nb_elts[i],
out_elem_dev->super.super.type != PARSEC_DEV_CUDA ?
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice,
out_elem_dev->super.super.type & PARSEC_DEV_CUDA ?
cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );
}else{
Expand Down
2 changes: 1 addition & 1 deletion src/zgemm_NN_gpu.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ BODY [type=CUDA]
__parsec_zgemm_NN_gpu_GEMM_task_t next_gemm;
memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t));
next_gemm.locals.k.value = descC->mt -1;
assert( PARSEC_DEV_CUDA == next_gemm.task_class->incarnations[chore_id].type );
assert( PARSEC_DEV_CUDA & next_gemm.task_class->incarnations[chore_id].type );
if(NULL != next_gemm.task_class->incarnations[chore_id].evaluate) {
if( next_gemm.task_class->incarnations[chore_id].evaluate((parsec_task_t*)&next_gemm) ==
PARSEC_HOOK_RETURN_NEXT ) {
Expand Down
6 changes: 3 additions & 3 deletions src/zgemm_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
nbgpu = 0;
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
parsec_device_module_t *device = parsec_mca_device_get(dev);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device;
nbgpu++;
if( 0 == gpu_mem_block_size )
Expand All @@ -239,7 +239,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB,
nbgpu= 0;
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
parsec_device_module_t *device = parsec_mca_device_get(dev);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
dev_index[nbgpu++] = device->device_index;
}
}
Expand Down Expand Up @@ -461,7 +461,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB,
int64_t gpu_mem_nb_blocks = -1;
for(devid = 0; devid < (int)parsec_nb_devices; devid++) {
parsec_device_module_t *device = parsec_mca_device_get(devid);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device;
nb_gpu_devices++;
if( 0 == gpu_mem_block_size )
Expand Down
1 change: 1 addition & 0 deletions src/zpotrf_U.jdf
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ BODY [type=CUDA]
handles = parsec_info_get(&gpu_stream->infos, CuHandlesID);
assert(NULL != handles);
wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID);
if( NULL == wp ) return PARSEC_HOOK_RETURN_AGAIN;
assert(NULL != wp);

workspace = (cuDoubleComplex*)wp->tmpmem;
Expand Down
28 changes: 24 additions & 4 deletions src/zpotrf_wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb )
}
}

#define USE_PARSEC_ZONE_FOR_WP_MEMORY 0

#if defined(DPLASMA_HAVE_CUDA)
void *zpotrf_create_workspace(void *obj, void *user)
{
parsec_device_module_t *mod = (parsec_device_module_t *)obj;
zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory;
cusolverDnHandle_t cusolverDnHandle;
cusolverStatus_t status;
parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)user;
Expand All @@ -73,6 +73,7 @@ void *zpotrf_create_workspace(void *obj, void *user)
size_t elt_size = sizeof(cuDoubleComplex);
cublasFillMode_t cublas_uplo;
dplasma_enum_t uplo = tp->_g_uplo;
void* tmpmem;

if( PlasmaLower == uplo )
cublas_uplo = CUBLAS_FILL_MODE_LOWER;
Expand All @@ -87,20 +88,39 @@ void *zpotrf_create_workspace(void *obj, void *user)
assert(CUSOLVER_STATUS_SUCCESS == status);

cusolverDnDestroy(cusolverDnHandle);

#if USE_PARSEC_ZONE_FOR_WP_MEMORY
parsec_device_module_t *mod = (parsec_device_module_t *)obj;
zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory;
tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int));
if( NULL == tmpmem )
return NULL;
#else
(void)obj;
cudaError_t rc = cudaMalloc(&tmpmem, workspace_size * elt_size + sizeof(int));
if( cudaSuccess != rc )
return NULL;
#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */
wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t));
wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int));
wp->tmpmem = tmpmem;
assert(NULL != wp->tmpmem);
wp->lwork = workspace_size;
#if USE_PARSEC_ZONE_FOR_WP_MEMORY
wp->memory = memory;
#else
wp->memory = NULL;
#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */

return wp;
}

static void destroy_workspace(void *_ws, void *_n)
{
dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws;
#if USE_PARSEC_ZONE_FOR_WP_MEMORY
zone_free((zone_malloc_t*)ws->memory, ws->tmpmem);
#else
(void)cudaFree(ws->tmpmem);
#endif /* USE_PARSEC_ZONE_FOR_WP_MEMORY */
free(ws);
(void)_n;
}
Expand Down
52 changes: 42 additions & 10 deletions tests/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ void print_usage(void)
"\n"
" -c --cores : number of concurrent threads (default: number of physical hyper-threads)\n"
" -g --gpus : number of GPU (default: 0)\n"
" -D --gpu_mask : mask of the GPUs to be used by this process: either a positive number to indicate the GPU mask, or -1 (to use the GPUs modulo the local rank of the process)\n"
" -m --thread_multi : initialize MPI_THREAD_MULTIPLE (default: no)\n"
" -o --scheduler : select the scheduler (default: LFQ)\n"
" Accepted values:\n"
Expand Down Expand Up @@ -181,6 +182,7 @@ static struct option long_options[] =
{"scheduler", required_argument, 0, 'o'},
{"gpus", required_argument, 0, 'g'},
{"g", required_argument, 0, 'g'},
{"D", required_argument, 0, 'D'},
{"V", required_argument, 0, 'V'},
{"vpmap", required_argument, 0, 'V'},
{"ht", required_argument, 0, 'H'},
Expand Down Expand Up @@ -324,6 +326,16 @@ static void read_arguments(int *_argc, char*** _argv, int* iparam)
}
iparam[IPARAM_NGPUS] = atoi(optarg);
break;
case 'D':
#if !defined(DPLASMA_HAVE_CUDA)
iparam[IPARAM_GPU_MASK] = DPLASMA_ERR_NOT_SUPPORTED; /* force an error message */
#endif
if(iparam[IPARAM_GPU_MASK] == DPLASMA_ERR_NOT_SUPPORTED) {
fprintf(stderr, "#!!!!! This test does not have GPU support. GPU disabled.\n");
break;
}
iparam[IPARAM_GPU_MASK] = atoi(optarg);
break;

case 'p': case 'P': iparam[IPARAM_P] = atoi(optarg); break;
case 'q': case 'Q': iparam[IPARAM_Q] = atoi(optarg); break;
Expand Down Expand Up @@ -448,7 +460,6 @@ static void parse_arguments(int *iparam) {
parsec_setenv_mca_param( "device_cuda_enabled", value, &environ );
free(value);
}

/* Check the process grid */
if(0 == iparam[IPARAM_P])
iparam[IPARAM_P] = iparam[IPARAM_NNODES];
Expand Down Expand Up @@ -510,6 +521,25 @@ static void parse_arguments(int *iparam) {
/* HQR */
if(-'P' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_P];
if(-'Q' == iparam[IPARAM_QR_HLVL_SZE]) iparam[IPARAM_QR_HLVL_SZE] = iparam[IPARAM_Q];
#if defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI)
if(iparam[IPARAM_NGPUS] > 0 && (-1 == iparam[IPARAM_GPU_MASK])) {
MPI_Comm local_comm;
int local_rank, local_size;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &local_comm);
MPI_Comm_rank(local_comm, &local_rank);
MPI_Comm_size(local_comm, &local_size);
MPI_Comm_free(&local_comm);
iparam[IPARAM_GPU_MASK] = 0;
for( int i = 0; i <= iparam[IPARAM_NGPUS]; i++ ) {
iparam[IPARAM_GPU_MASK] |= ((1 << local_rank) << i);
}
char* value;
asprintf(&value, "%d", iparam[IPARAM_GPU_MASK]);
parsec_setenv_mca_param("device_cuda_mask", value, &environ);
free(value); value = NULL;
}
#endif /* defined(DPLASMA_HAVE_CUDA) && defined(PARSEC_HAVE_MPI) */
}

static void print_arguments(int* iparam)
Expand Down Expand Up @@ -698,21 +728,23 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam)
int dev, nbgpu = 0;
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
parsec_device_module_t *device = parsec_mca_device_get(dev);
if( PARSEC_DEV_CUDA == device->type ) {
if( PARSEC_DEV_CUDA & device->type ) {
nbgpu++;
}
}
if( nbgpu > 0 ) {
if( iparam[IPARAM_NGPUS] < 0 ) {
iparam[IPARAM_NGPUS] = nbgpu;
}
if( iparam[IPARAM_NGPUS] > 0 ) {
if(iparam[IPARAM_VERBOSE] >= 3) {
parsec_setenv_mca_param( "device_show_statistics", "1", &environ );
}
CuHI = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES",
dplasma_destroy_cuda_handles, NULL,
dplasma_create_cuda_handles, NULL,
NULL);
dplasma_destroy_cuda_handles, NULL,
dplasma_create_cuda_handles, NULL,
NULL);
assert(-1 != CuHI);
}
iparam[IPARAM_NGPUS] = nbgpu;
if(iparam[IPARAM_NGPUS] > 0 && iparam[IPARAM_VERBOSE] >= 3) {
parsec_setenv_mca_param( "device_show_statistics", "1", &environ );
}
#endif

print_arguments(iparam);
Expand Down
1 change: 1 addition & 0 deletions tests/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ enum iparam_t {
IPARAM_BUT_LEVEL, /* Butterfly level */
IPARAM_SCHEDULER, /* User-selected scheduler */
IPARAM_NRUNS, /* Number of times to run the kernel */
IPARAM_GPU_MASK, /* The mask of GPU to be used */
IPARAM_SIZEOF
};

Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgelqf.c
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgelqf->nb_task_classes; i++) {
for(int j = 0; NULL != zgelqf->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgelqf->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgelqf->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -426,7 +426,7 @@ static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgelqf_hqr.c
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgelqf_param_tp->nb_task_classes; i++) {
for(int j = 0; NULL != zgelqf_param_tp->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -480,7 +480,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgelqf_systolic.c
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgelqf_sys_tp->nb_task_classes; i++) {
for(int j = 0; NULL != zgelqf_sys_tp->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -474,7 +474,7 @@ static void warmup_hqr(parsec_context_t *parsec, int *iparam)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgeqrf_hqr.c
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgeqrf_hqr->nb_task_classes; i++) {
for(int j = 0; NULL != zgeqrf_hqr->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -480,7 +480,7 @@ static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_con
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgeqrf_systolic.c
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgeqrf_systolic->nb_task_classes; i++) {
for(int j = 0; NULL != zgeqrf_systolic->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -466,7 +466,7 @@ static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parse
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
4 changes: 2 additions & 2 deletions tests/testing_zgesvd.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec)
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgesvd->nb_task_classes; i++) {
for(int j = 0; NULL != zgesvd->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgesvd->task_classes_array[i]->incarnations[j].type == dtype ) {
if( zgesvd->task_classes_array[i]->incarnations[j].type & dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
Expand All @@ -339,7 +339,7 @@ static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec)
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
if( !(dev->type & dtype) )
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
Expand Down
Loading
Loading