Handle failures during OpenCL context init gracefully. (#2078)

A few uses of OPENCL_SUCCEED_FATAL() remain, but mostly in places where a failure implies that something is fundamentally wrong with the system.
diku-dk · Jan 9, 2024 · cccac0b
1 parent 3b242d8
commit cccac0b
Show file tree

Hide file tree

Showing 7 changed files with 83 additions and 38 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -36,6 +36,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   result in circular sizes, which usually manifested as the type
   checker going into an infinite loop (#2073).
 
+* The OpenCL backend now more gracefully handles lack of platform.
+
 ## [0.25.11]
 
 ### Added

diff --git a/rts/c/backends/c.h b/rts/c/backends/c.h
@@ -43,6 +43,7 @@ struct futhark_context {
   int64_t peak_mem_usage_default;
   int64_t cur_mem_usage_default;
   struct program* program;
+  bool program_initialised;
 };
 
 int backend_context_setup(struct futhark_context* ctx) {

diff --git a/rts/c/backends/cuda.h b/rts/c/backends/cuda.h
@@ -273,6 +273,8 @@ struct futhark_context {
   struct event_list event_list;
   int64_t peak_mem_usage_default;
   int64_t cur_mem_usage_default;
+  struct program* program;
+  bool program_initialised;
   // Uniform fields above.
 
   CUdeviceptr global_failure;
@@ -284,7 +286,6 @@ struct futhark_context {
   long int total_runtime;
   int64_t peak_mem_usage_device;
   int64_t cur_mem_usage_device;
-  struct program* program;
 
   CUdevice dev;
   CUcontext cu_ctx;
@@ -819,6 +820,7 @@ int backend_context_setup(struct futhark_context* ctx) {
   ctx->total_runtime = 0;
   ctx->peak_mem_usage_device = 0;
   ctx->cur_mem_usage_device = 0;
+  ctx->kernels = NULL;
 
   CUDA_SUCCEED_FATAL(cuInit(0));
   if (cuda_device_setup(ctx) != 0) {
@@ -868,13 +870,16 @@ int backend_context_setup(struct futhark_context* ctx) {
 }
 
 void backend_context_teardown(struct futhark_context* ctx) {
-  free_builtin_kernels(ctx, ctx->kernels);
-  cuMemFree(ctx->global_failure);
-  cuMemFree(ctx->global_failure_args);
-  CUDA_SUCCEED_FATAL(gpu_free_all(ctx));
-  CUDA_SUCCEED_FATAL(cuStreamDestroy(ctx->stream));
-  CUDA_SUCCEED_FATAL(cuModuleUnload(ctx->module));
-  CUDA_SUCCEED_FATAL(cuCtxDestroy(ctx->cu_ctx));
+  if (ctx->kernels != NULL) {
+    free_builtin_kernels(ctx, ctx->kernels);
+    cuMemFree(ctx->global_failure);
+    cuMemFree(ctx->global_failure_args);
+    CUDA_SUCCEED_FATAL(gpu_free_all(ctx));
+    CUDA_SUCCEED_FATAL(cuStreamDestroy(ctx->stream));
+    CUDA_SUCCEED_FATAL(cuModuleUnload(ctx->module));
+    CUDA_SUCCEED_FATAL(cuCtxDestroy(ctx->cu_ctx));
+  }
+  free_list_destroy(&ctx->gpu_free_list);
 }
 
 // GPU ABSTRACTION LAYER

diff --git a/rts/c/backends/hip.h b/rts/c/backends/hip.h
@@ -248,6 +248,7 @@ struct futhark_context {
   struct event_list event_list;
   int64_t peak_mem_usage_default;
   int64_t cur_mem_usage_default;
+  bool program_initialised;
   // Uniform fields above.
 
   void* global_failure;
@@ -676,6 +677,7 @@ int backend_context_setup(struct futhark_context* ctx) {
   ctx->total_runtime = 0;
   ctx->peak_mem_usage_device = 0;
   ctx->cur_mem_usage_device = 0;
+  ctx->kernels = NULL;
 
   HIP_SUCCEED_FATAL(hipInit(0));
   if (hip_device_setup(ctx) != 0) {
@@ -724,12 +726,15 @@ int backend_context_setup(struct futhark_context* ctx) {
 }
 
 void backend_context_teardown(struct futhark_context* ctx) {
-  free_builtin_kernels(ctx, ctx->kernels);
-  hipFree(ctx->global_failure);
-  hipFree(ctx->global_failure_args);
-  HIP_SUCCEED_FATAL(gpu_free_all(ctx));
-  HIP_SUCCEED_FATAL(hipStreamDestroy(ctx->stream));
-  HIP_SUCCEED_FATAL(hipModuleUnload(ctx->module));
+  if (ctx->kernels != NULL) {
+    free_builtin_kernels(ctx, ctx->kernels);
+    hipFree(ctx->global_failure);
+    hipFree(ctx->global_failure_args);
+    HIP_SUCCEED_FATAL(gpu_free_all(ctx));
+    HIP_SUCCEED_FATAL(hipStreamDestroy(ctx->stream));
+    HIP_SUCCEED_FATAL(hipModuleUnload(ctx->module));
+  }
+  free_list_destroy(&ctx->gpu_free_list);
 }
 
 // GPU ABSTRACTION LAYER

diff --git a/rts/c/backends/multicore.h b/rts/c/backends/multicore.h
@@ -50,6 +50,7 @@ struct futhark_context {
   int64_t peak_mem_usage_default;
   int64_t cur_mem_usage_default;
   struct program* program;
+  bool program_initialised;
   // Uniform fields above.
 
   lock_t event_list_lock;

diff --git a/rts/c/backends/opencl.h b/rts/c/backends/opencl.h
@@ -513,8 +513,8 @@ struct futhark_context {
   int64_t peak_mem_usage_default;
   int64_t cur_mem_usage_default;
   struct program* program;
-
-  // Common fields above.
+  bool program_initialised;
+  // Uniform fields above.
 
   cl_mem global_failure;
   cl_mem global_failure_args;
@@ -546,7 +546,7 @@ struct futhark_context {
   struct builtin_kernels* kernels;
 };
 
-static cl_build_status build_gpu_program(cl_program program, cl_device_id device, const char* options) {
+static cl_build_status build_gpu_program(cl_program program, cl_device_id device, const char* options, char** log) {
   cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL);
 
   // Avoid termination due to CL_BUILD_PROGRAM_FAILURE
@@ -563,20 +563,18 @@ static cl_build_status build_gpu_program(cl_program program, cl_device_id device
                                              &build_status,
                                              NULL));
 
-  if (build_status != CL_SUCCESS) {
+  if (build_status != CL_BUILD_SUCCESS) {
     char *build_log;
     size_t ret_val_size;
     OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size));
 
     build_log = (char*) malloc(ret_val_size+1);
     OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL));
 
-    // The spec technically does not say whether the build log is zero-terminated, so let's be careful.
+    // The spec technically does not say whether the build log is
+    // zero-terminated, so let's be careful.
     build_log[ret_val_size] = '\0';
-
-    fprintf(stderr, "Build log:\n%s\n", build_log);
-
-    free(build_log);
+    *log = build_log;
   }
 
   return build_status;
@@ -977,10 +975,25 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
   if (ctx->cfg->logging) {
     fprintf(stderr, "Building OpenCL program...\n");
   }
-  OPENCL_SUCCEED_FATAL(build_gpu_program(prog, device_option.device, compile_opts));
-
+  char* build_log;
+  cl_build_status status =
+    build_gpu_program(prog, device_option.device, compile_opts, &build_log);
   free(compile_opts);
 
+  if (status != CL_BUILD_SUCCESS) {
+    ctx->error = msgprintf("Compilation of OpenCL program failed.\nBuild log:\n%s",
+                           build_log);
+    // We are giving up on initialising this OpenCL context. That also
+    // means we need to free all the OpenCL bits we have managed to
+    // allocate thus far, as futhark_context_free() will not touch
+    // these unless initialisation was completely successful.
+    (void)clReleaseProgram(prog);
+    (void)clReleaseCommandQueue(ctx->queue);
+    (void)clReleaseContext(ctx->ctx);
+    free(build_log);
+    return;
+  }
+
   size_t binary_size = 0;
   unsigned char *binary = NULL;
   int store_in_cache = cache_fname != NULL && !loaded_from_cache;
@@ -1011,7 +1024,8 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
   ctx->clprogram = prog;
 }
 
-static struct opencl_device_option get_preferred_device(const struct futhark_context_config *cfg) {
+static struct opencl_device_option get_preferred_device(struct futhark_context *ctx,
+                                                        const struct futhark_context_config *cfg) {
   struct opencl_device_option *devices;
   size_t num_devices;
 
@@ -1038,14 +1052,19 @@ static struct opencl_device_option get_preferred_device(const struct futhark_con
     }
   }
 
-  futhark_panic(1, "Could not find acceptable OpenCL device.\n");
-  exit(1); // Never reached
+  ctx->error = strdup("Could not find acceptable OpenCL device.\n");
+  struct opencl_device_option device;
+  return device;
 }
 
 static void setup_opencl(struct futhark_context *ctx,
                          const char *extra_build_opts[],
                          const char* cache_fname) {
-  struct opencl_device_option device_option = get_preferred_device(ctx->cfg);
+  struct opencl_device_option device_option = get_preferred_device(ctx, ctx->cfg);
+
+  if (ctx->error != NULL) {
+    return;
+  }
 
   if (ctx->cfg->logging) {
     fprintf(stderr, "Using platform: %s\n", device_option.platform_name);
@@ -1084,13 +1103,18 @@ int backend_context_setup(struct futhark_context* ctx) {
   ctx->total_runtime = 0;
   ctx->peak_mem_usage_device = 0;
   ctx->cur_mem_usage_device = 0;
+  ctx->kernels = NULL;
 
   if (ctx->cfg->queue_set) {
     setup_opencl_with_command_queue(ctx, ctx->cfg->queue, (const char**)ctx->cfg->build_opts, ctx->cfg->cache_fname);
   } else {
     setup_opencl(ctx, (const char**)ctx->cfg->build_opts, ctx->cfg->cache_fname);
   }
 
+  if (ctx->error != NULL) {
+    return 1;
+  }
+
   cl_int error;
   cl_int no_error = -1;
   ctx->global_failure =
@@ -1116,13 +1140,16 @@ int backend_context_setup(struct futhark_context* ctx) {
 static int gpu_free_all(struct futhark_context *ctx);
 
 void backend_context_teardown(struct futhark_context* ctx) {
-  free_builtin_kernels(ctx, ctx->kernels);
-  OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure));
-  OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure_args));
-  (void)gpu_free_all(ctx);
-  (void)clReleaseProgram(ctx->clprogram);
-  (void)clReleaseCommandQueue(ctx->queue);
-  (void)clReleaseContext(ctx->ctx);
+  if (ctx->kernels != NULL) {
+    free_builtin_kernels(ctx, ctx->kernels);
+    OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure));
+    OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure_args));
+    (void)gpu_free_all(ctx);
+    (void)clReleaseProgram(ctx->clprogram);
+    (void)clReleaseCommandQueue(ctx->queue);
+    (void)clReleaseContext(ctx->ctx);
+  }
+  free_list_destroy(&ctx->gpu_free_list);
 }
 
 cl_command_queue futhark_context_get_command_queue(struct futhark_context* ctx) {

diff --git a/rts/c/context.h b/rts/c/context.h
@@ -116,6 +116,7 @@ struct futhark_context* futhark_context_new(struct futhark_context_config* cfg)
   assert(!cfg->in_use);
   ctx->cfg = cfg;
   ctx->cfg->in_use = 1;
+  ctx->program_initialised = false;
   create_lock(&ctx->error_lock);
   create_lock(&ctx->lock);
   free_list_init(&ctx->free_list);
@@ -134,15 +135,18 @@ struct futhark_context* futhark_context_new(struct futhark_context_config* cfg)
   if (backend_context_setup(ctx) == 0) {
     setup_program(ctx);
     init_constants(ctx);
+    ctx->program_initialised = true;
     (void)futhark_context_clear_caches(ctx);
     (void)futhark_context_sync(ctx);
   }
   return ctx;
 }
 
 void futhark_context_free(struct futhark_context* ctx) {
-  free_constants(ctx);
-  teardown_program(ctx);
+  if (ctx->program_initialised) {
+    free_constants(ctx);
+    teardown_program(ctx);
+  }
   backend_context_teardown(ctx);
   free_all_in_free_list(ctx);
   free_list_destroy(&ctx->free_list);