diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 6978a31924d5fd..e306ba1c108a8d 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -4927,8 +4927,8 @@ GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
     GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
-    SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
-        (char *)tensor->data + offset, data, size).wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        (stream)->memcpy((char *)tensor->data + offset, data, size)));
 } catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -5181,13 +5181,73 @@ GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, gg
     return buft_ctx->device == sycl_ctx->device;
 }
 
+static ggml_backend_event_t
+ggml_backend_sycl_event_new(ggml_backend_t backend) {
+    ggml_backend_sycl_context *sycl_ctx =
+        (ggml_backend_sycl_context *)backend->context;
+
+    sycl::event *event_ptr = new sycl::event();
+
+    return new ggml_backend_event{
+        /* .backend = */ backend,
+        /* .context = */ event_ptr,
+    };
+}
+
+static void ggml_backend_sycl_event_free(ggml_backend_event_t event) try {
+    if (event == nullptr) {
+        return;
+    }
+
+    if (event->context != nullptr) {
+        sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+        delete sycl_event;
+        event->context = nullptr;
+    }
+
+    delete event;
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static void ggml_backend_sycl_event_record(ggml_backend_event_t event) try {
+    if (event == nullptr || event->context == nullptr) {
+        return;
+    }
+
+    ggml_backend_sycl_context *sycl_ctx =
+        (ggml_backend_sycl_context *)event->backend->context;
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+
+    const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
+    // Record the current state of the queue
+    *sycl_event = stream->ext_oneapi_submit_barrier();
+} catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+static void ggml_backend_sycl_event_synchronize(ggml_backend_event_t event) {
+    if (event == nullptr || event->context == nullptr) {
+        return;
+    }
+
+    sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
+    SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
+}
+
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name                = */ ggml_backend_sycl_name,
     /* .free                    = */ ggml_backend_sycl_free,
     /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
-    /* .cpy_tensor_async        = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
+    /* .cpy_tensor_async        = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
+                                           // TODO: update for the new
+                                           // interface
     /* .synchronize             = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
@@ -5197,11 +5257,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
     /* .supports_buft           = */ ggml_backend_sycl_supports_buft,
     /* .offload_op              = */ ggml_backend_sycl_offload_op,
-    /* .event_new               = */ NULL,
-    /* .event_free              = */ NULL,
-    /* .event_record            = */ NULL,
+    /* .event_new               = */ ggml_backend_sycl_event_new,
+    /* .event_free              = */ ggml_backend_sycl_event_free,
+    /* .event_record            = */ ggml_backend_sycl_event_record,
     /* .event_wait              = */ NULL,
-    /* .event_synchronize       = */ NULL,
+    /* .event_synchronize       = */ ggml_backend_sycl_event_synchronize,
 };
 
 static ggml_guid_t ggml_backend_sycl_guid() {
diff --git a/src/llama.cpp b/src/llama.cpp
index 7775a8d183c165..de93beb0d10249 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5046,9 +5046,7 @@ struct llama_model_loader {
                 }
             }
         }
-#endif
-
-#if defined(GGML_USE_SYCL)
+#elif defined(GGML_USE_SYCL)
         // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
         // NVMe raid configurations might require more / larger buffers.
         constexpr size_t n_buffers = 4;
@@ -5056,7 +5054,7 @@ struct llama_model_loader {
 
         std::vector<ggml_backend_buffer_t> host_buffers;
         std::vector<void *> host_ptrs;
-        // std::vector<ggml_backend_event_t> events;
+        std::vector<ggml_backend_event_t> events;
         size_t buffer_idx = 0; // buffer to use for async loads
 
         ggml_backend_t sycl_backend = nullptr;
@@ -5080,6 +5078,7 @@ struct llama_model_loader {
             for (size_t idx = 0; idx < n_buffers; ++idx) {
                 host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
                 host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                events.emplace_back(ggml_backend_event_new(sycl_backend));
             }
         }
     }
@@ -5161,24 +5160,29 @@ struct llama_model_loader {
             }
             else
-#endif
-#if defined(GGML_USE_SYCL)
-            // If sycl_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
-            if (sycl_backend) {
-                file->seek(weight->offs, SEEK_SET);
-
-                size_t bytes_read = 0;
+#elif defined(GGML_USE_SYCL)
+            // If sycl_backend is valid load the tensor in chunks to
+            // pinned memory and upload the buffers asynchronously to the
+            // GPU.
+            if (sycl_backend) {
+                file->seek(weight->offs, SEEK_SET);
 
-                while (bytes_read < n_size) {
-                    size_t read_iteration = std::min(buffer_size, n_size - bytes_read);
-                    file->read_raw(host_ptrs[buffer_idx], read_iteration);
-                    ggml_backend_tensor_set_async(sycl_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
-                    bytes_read += read_iteration;
-                    ++buffer_idx;
-                    buffer_idx %= n_buffers;
-                }
-            }
-            else
+                size_t bytes_read = 0;
+
+                while (bytes_read < n_size) {
+                    size_t read_iteration =
+                        std::min(buffer_size, n_size - bytes_read);
+                    ggml_backend_event_synchronize(events[buffer_idx]);
+                    file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                    ggml_backend_tensor_set_async(sycl_backend, cur,
+                                                  host_ptrs[buffer_idx],
+                                                  bytes_read, read_iteration);
+                    ggml_backend_event_record(events[buffer_idx]);
+                    bytes_read += read_iteration;
+                    ++buffer_idx;
+                    buffer_idx %= n_buffers;
+                }
+            } else
 #endif
             {
                 read_buf.resize(n_size);
@@ -5205,15 +5209,13 @@
             }
             ggml_backend_free(cuda_backend);
         }
-#endif
-
-#if defined(GGML_USE_SYCL)
+#elif defined(GGML_USE_SYCL)
         // free temporary resources used for async cuda uploads
         if (sycl_backend) {
             for (size_t idx = 0; idx < n_buffers;++idx) {
-                // ggml_backend_event_synchronize(events[idx]);
-                // ggml_backend_event_free(events[idx]);
-                 ggml_backend_buffer_free(host_buffers[idx]);
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
             }
             ggml_backend_free(sycl_backend);
         }
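
For reference, below is a minimal standalone sketch (not part of the patch) of the record/synchronize pattern that the new ggml_backend_sycl_event_record / ggml_backend_sycl_event_synchronize callbacks rely on: queue::ext_oneapi_submit_barrier() returns a sycl::event that completes only after all work previously submitted to that queue, so waiting on the recorded event tells the loader when a pinned staging buffer may be reused. It assumes a DPC++ compiler that provides the oneAPI enqueue-barrier extension; the buffer size and names are illustrative only.

#include <sycl/sycl.hpp>
#include <cstring>

int main() {
    // In-order queue used for simplicity; the backend's own queues may be configured differently.
    sycl::queue q{sycl::property::queue::in_order()};

    constexpr size_t n = 1024 * 1024;                 // illustrative 1MB staging chunk
    char *staging = sycl::malloc_host<char>(n, q);    // pinned host staging buffer
    char *dst     = sycl::malloc_device<char>(n, q);  // device destination

    std::memset(staging, 0xAB, n);
    q.memcpy(dst, staging, n);                        // async upload, no .wait()

    // "record": capture everything submitted to the queue so far
    // (ext_oneapi_submit_barrier is a DPC++/oneAPI extension)
    sycl::event ev = q.ext_oneapi_submit_barrier();

    // ... the CPU could fill other staging buffers here ...

    // "synchronize": once ev completes, the upload has finished and the
    // staging buffer can safely be overwritten with the next chunk
    ev.wait();

    sycl::free(staging, q);
    sycl::free(dst, q);
    return 0;
}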