Skip to content

Commit

Permalink
Handle failures during OpenCL context init gracefully. (#2078)
Browse files Browse the repository at this point in the history
There are still a few places where we do OPENCL_SUCCEED_FATAL(), but
mostly in places where failure implies something is completely wonky
with the system.
  • Loading branch information
athas authored Jan 9, 2024
1 parent 3b242d8 commit cccac0b
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 38 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
result in circular sizes, which usually manifested as the type
checker going into an infinite loop (#2073).

* The OpenCL backend now handles the lack of a platform more gracefully.

## [0.25.11]

### Added
Expand Down
1 change: 1 addition & 0 deletions rts/c/backends/c.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ struct futhark_context {
int64_t peak_mem_usage_default;
int64_t cur_mem_usage_default;
struct program* program;
bool program_initialised;
};

int backend_context_setup(struct futhark_context* ctx) {
Expand Down
21 changes: 13 additions & 8 deletions rts/c/backends/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,8 @@ struct futhark_context {
struct event_list event_list;
int64_t peak_mem_usage_default;
int64_t cur_mem_usage_default;
struct program* program;
bool program_initialised;
// Uniform fields above.

CUdeviceptr global_failure;
Expand All @@ -284,7 +286,6 @@ struct futhark_context {
long int total_runtime;
int64_t peak_mem_usage_device;
int64_t cur_mem_usage_device;
struct program* program;

CUdevice dev;
CUcontext cu_ctx;
Expand Down Expand Up @@ -819,6 +820,7 @@ int backend_context_setup(struct futhark_context* ctx) {
ctx->total_runtime = 0;
ctx->peak_mem_usage_device = 0;
ctx->cur_mem_usage_device = 0;
ctx->kernels = NULL;

CUDA_SUCCEED_FATAL(cuInit(0));
if (cuda_device_setup(ctx) != 0) {
Expand Down Expand Up @@ -868,13 +870,16 @@ int backend_context_setup(struct futhark_context* ctx) {
}

void backend_context_teardown(struct futhark_context* ctx) {
free_builtin_kernels(ctx, ctx->kernels);
cuMemFree(ctx->global_failure);
cuMemFree(ctx->global_failure_args);
CUDA_SUCCEED_FATAL(gpu_free_all(ctx));
CUDA_SUCCEED_FATAL(cuStreamDestroy(ctx->stream));
CUDA_SUCCEED_FATAL(cuModuleUnload(ctx->module));
CUDA_SUCCEED_FATAL(cuCtxDestroy(ctx->cu_ctx));
if (ctx->kernels != NULL) {
free_builtin_kernels(ctx, ctx->kernels);
cuMemFree(ctx->global_failure);
cuMemFree(ctx->global_failure_args);
CUDA_SUCCEED_FATAL(gpu_free_all(ctx));
CUDA_SUCCEED_FATAL(cuStreamDestroy(ctx->stream));
CUDA_SUCCEED_FATAL(cuModuleUnload(ctx->module));
CUDA_SUCCEED_FATAL(cuCtxDestroy(ctx->cu_ctx));
}
free_list_destroy(&ctx->gpu_free_list);
}

// GPU ABSTRACTION LAYER
Expand Down
17 changes: 11 additions & 6 deletions rts/c/backends/hip.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ struct futhark_context {
struct event_list event_list;
int64_t peak_mem_usage_default;
int64_t cur_mem_usage_default;
bool program_initialised;
// Uniform fields above.

void* global_failure;
Expand Down Expand Up @@ -676,6 +677,7 @@ int backend_context_setup(struct futhark_context* ctx) {
ctx->total_runtime = 0;
ctx->peak_mem_usage_device = 0;
ctx->cur_mem_usage_device = 0;
ctx->kernels = NULL;

HIP_SUCCEED_FATAL(hipInit(0));
if (hip_device_setup(ctx) != 0) {
Expand Down Expand Up @@ -724,12 +726,15 @@ int backend_context_setup(struct futhark_context* ctx) {
}

void backend_context_teardown(struct futhark_context* ctx) {
free_builtin_kernels(ctx, ctx->kernels);
hipFree(ctx->global_failure);
hipFree(ctx->global_failure_args);
HIP_SUCCEED_FATAL(gpu_free_all(ctx));
HIP_SUCCEED_FATAL(hipStreamDestroy(ctx->stream));
HIP_SUCCEED_FATAL(hipModuleUnload(ctx->module));
if (ctx->kernels != NULL) {
free_builtin_kernels(ctx, ctx->kernels);
hipFree(ctx->global_failure);
hipFree(ctx->global_failure_args);
HIP_SUCCEED_FATAL(gpu_free_all(ctx));
HIP_SUCCEED_FATAL(hipStreamDestroy(ctx->stream));
HIP_SUCCEED_FATAL(hipModuleUnload(ctx->module));
}
free_list_destroy(&ctx->gpu_free_list);
}

// GPU ABSTRACTION LAYER
Expand Down
1 change: 1 addition & 0 deletions rts/c/backends/multicore.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct futhark_context {
int64_t peak_mem_usage_default;
int64_t cur_mem_usage_default;
struct program* program;
bool program_initialised;
// Uniform fields above.

lock_t event_list_lock;
Expand Down
71 changes: 49 additions & 22 deletions rts/c/backends/opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -513,8 +513,8 @@ struct futhark_context {
int64_t peak_mem_usage_default;
int64_t cur_mem_usage_default;
struct program* program;

// Common fields above.
bool program_initialised;
// Uniform fields above.

cl_mem global_failure;
cl_mem global_failure_args;
Expand Down Expand Up @@ -546,7 +546,7 @@ struct futhark_context {
struct builtin_kernels* kernels;
};

static cl_build_status build_gpu_program(cl_program program, cl_device_id device, const char* options) {
static cl_build_status build_gpu_program(cl_program program, cl_device_id device, const char* options, char** log) {
cl_int clBuildProgram_error = clBuildProgram(program, 1, &device, options, NULL, NULL);

// Avoid termination due to CL_BUILD_PROGRAM_FAILURE
Expand All @@ -563,20 +563,18 @@ static cl_build_status build_gpu_program(cl_program program, cl_device_id device
&build_status,
NULL));

if (build_status != CL_SUCCESS) {
if (build_status != CL_BUILD_SUCCESS) {
char *build_log;
size_t ret_val_size;
OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size));

build_log = (char*) malloc(ret_val_size+1);
OPENCL_SUCCEED_FATAL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL));

// The spec technically does not say whether the build log is zero-terminated, so let's be careful.
// The spec technically does not say whether the build log is
// zero-terminated, so let's be careful.
build_log[ret_val_size] = '\0';

fprintf(stderr, "Build log:\n%s\n", build_log);

free(build_log);
*log = build_log;
}

return build_status;
Expand Down Expand Up @@ -977,10 +975,25 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
if (ctx->cfg->logging) {
fprintf(stderr, "Building OpenCL program...\n");
}
OPENCL_SUCCEED_FATAL(build_gpu_program(prog, device_option.device, compile_opts));

char* build_log;
cl_build_status status =
build_gpu_program(prog, device_option.device, compile_opts, &build_log);
free(compile_opts);

if (status != CL_BUILD_SUCCESS) {
ctx->error = msgprintf("Compilation of OpenCL program failed.\nBuild log:\n%s",
build_log);
// We are giving up on initialising this OpenCL context. That also
// means we need to free all the OpenCL bits we have managed to
// allocate thus far, as futhark_context_free() will not touch
// these unless initialisation was completely successful.
(void)clReleaseProgram(prog);
(void)clReleaseCommandQueue(ctx->queue);
(void)clReleaseContext(ctx->ctx);
free(build_log);
return;
}

size_t binary_size = 0;
unsigned char *binary = NULL;
int store_in_cache = cache_fname != NULL && !loaded_from_cache;
Expand Down Expand Up @@ -1011,7 +1024,8 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
ctx->clprogram = prog;
}

static struct opencl_device_option get_preferred_device(const struct futhark_context_config *cfg) {
static struct opencl_device_option get_preferred_device(struct futhark_context *ctx,
const struct futhark_context_config *cfg) {
struct opencl_device_option *devices;
size_t num_devices;

Expand All @@ -1038,14 +1052,19 @@ static struct opencl_device_option get_preferred_device(const struct futhark_con
}
}

futhark_panic(1, "Could not find acceptable OpenCL device.\n");
exit(1); // Never reached
ctx->error = strdup("Could not find acceptable OpenCL device.\n");
struct opencl_device_option device;
return device;
}

static void setup_opencl(struct futhark_context *ctx,
const char *extra_build_opts[],
const char* cache_fname) {
struct opencl_device_option device_option = get_preferred_device(ctx->cfg);
struct opencl_device_option device_option = get_preferred_device(ctx, ctx->cfg);

if (ctx->error != NULL) {
return;
}

if (ctx->cfg->logging) {
fprintf(stderr, "Using platform: %s\n", device_option.platform_name);
Expand Down Expand Up @@ -1084,13 +1103,18 @@ int backend_context_setup(struct futhark_context* ctx) {
ctx->total_runtime = 0;
ctx->peak_mem_usage_device = 0;
ctx->cur_mem_usage_device = 0;
ctx->kernels = NULL;

if (ctx->cfg->queue_set) {
setup_opencl_with_command_queue(ctx, ctx->cfg->queue, (const char**)ctx->cfg->build_opts, ctx->cfg->cache_fname);
} else {
setup_opencl(ctx, (const char**)ctx->cfg->build_opts, ctx->cfg->cache_fname);
}

if (ctx->error != NULL) {
return 1;
}

cl_int error;
cl_int no_error = -1;
ctx->global_failure =
Expand All @@ -1116,13 +1140,16 @@ int backend_context_setup(struct futhark_context* ctx) {
static int gpu_free_all(struct futhark_context *ctx);

void backend_context_teardown(struct futhark_context* ctx) {
free_builtin_kernels(ctx, ctx->kernels);
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure));
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure_args));
(void)gpu_free_all(ctx);
(void)clReleaseProgram(ctx->clprogram);
(void)clReleaseCommandQueue(ctx->queue);
(void)clReleaseContext(ctx->ctx);
if (ctx->kernels != NULL) {
free_builtin_kernels(ctx, ctx->kernels);
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure));
OPENCL_SUCCEED_FATAL(clReleaseMemObject(ctx->global_failure_args));
(void)gpu_free_all(ctx);
(void)clReleaseProgram(ctx->clprogram);
(void)clReleaseCommandQueue(ctx->queue);
(void)clReleaseContext(ctx->ctx);
}
free_list_destroy(&ctx->gpu_free_list);
}

cl_command_queue futhark_context_get_command_queue(struct futhark_context* ctx) {
Expand Down
8 changes: 6 additions & 2 deletions rts/c/context.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ struct futhark_context* futhark_context_new(struct futhark_context_config* cfg)
assert(!cfg->in_use);
ctx->cfg = cfg;
ctx->cfg->in_use = 1;
ctx->program_initialised = false;
create_lock(&ctx->error_lock);
create_lock(&ctx->lock);
free_list_init(&ctx->free_list);
Expand All @@ -134,15 +135,18 @@ struct futhark_context* futhark_context_new(struct futhark_context_config* cfg)
if (backend_context_setup(ctx) == 0) {
setup_program(ctx);
init_constants(ctx);
ctx->program_initialised = true;
(void)futhark_context_clear_caches(ctx);
(void)futhark_context_sync(ctx);
}
return ctx;
}

void futhark_context_free(struct futhark_context* ctx) {
free_constants(ctx);
teardown_program(ctx);
if (ctx->program_initialised) {
free_constants(ctx);
teardown_program(ctx);
}
backend_context_teardown(ctx);
free_all_in_free_list(ctx);
free_list_destroy(&ctx->free_list);
Expand Down

0 comments on commit cccac0b

Please sign in to comment.