implemented missing SYCL event APIs

OuadiElfarouki committed Sep 30, 2024
1 parent db150de commit 520fe5a

Showing 2 changed files with 95 additions and 33 deletions.
74 changes: 67 additions & 7 deletions ggml/src/ggml-sycl.cpp
@@ -4927,8 +4927,8 @@ GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,

GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
(char *)tensor->data + offset, data, size).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(
(stream)->memcpy((char *)tensor->data + offset, data, size)));
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
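
With the trailing .wait() removed, the memcpy issued by ggml_backend_sycl_set_tensor_async now returns before the copy has finished, so completion has to be observed explicitly. The sketch below is not part of the commit; it only illustrates the intended caller-side pattern using the generic ggml-backend calls that this commit wires up (the tensor, source pointer and size are placeholders):

    #include "ggml-backend.h"

    // Hedged sketch: enqueue an async upload, record a marker, overlap host
    // work, then block only when the data is actually needed.
    static void upload_and_overlap(ggml_backend_t backend, struct ggml_tensor * tensor,
                                   const void * src, size_t nbytes) {
        ggml_backend_tensor_set_async(backend, tensor, src, 0, nbytes); // returns immediately
        ggml_backend_event_t ev = ggml_backend_event_new(backend);      // wraps a sycl::event in this backend
        ggml_backend_event_record(ev);                                  // marks the queue state after the copy
        /* ... unrelated host-side work can run while the copy is in flight ... */
        ggml_backend_event_synchronize(ev);                             // host waits for the copy to finish
        ggml_backend_event_free(ev);
    }
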
@@ -5181,13 +5181,73 @@ GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, gg
return buft_ctx->device == sycl_ctx->device;
}

static ggml_backend_event_t
ggml_backend_sycl_event_new(ggml_backend_t backend) {
ggml_backend_sycl_context *sycl_ctx =
(ggml_backend_sycl_context *)backend->context;

sycl::event *event_ptr = new sycl::event();

return new ggml_backend_event{
/* .backend = */ backend,
/* .context = */ event_ptr,
};
}

static void ggml_backend_sycl_event_free(ggml_backend_event_t event) try {
if (event == nullptr) {
return;
}

if (event->context != nullptr) {
sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
delete sycl_event;
event->context = nullptr;
}

delete event;
} catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}

static void ggml_backend_sycl_event_record(ggml_backend_event_t event) try {
if (event == nullptr || event->context == nullptr) {
return;
}

ggml_backend_sycl_context *sycl_ctx =
(ggml_backend_sycl_context *)event->backend->context;
sycl::event *sycl_event = static_cast<sycl::event *>(event->context);

const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
// Record the current state of the queue
*sycl_event = stream->ext_oneapi_submit_barrier();
} catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}
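
ext_oneapi_submit_barrier() is a DPC++/oneAPI extension: it enqueues a barrier on the queue and returns a sycl::event that completes once everything submitted before it has finished, which is what lets it act as a CUDA-style "record". A standalone sketch of that behaviour in plain SYCL, assuming a oneAPI toolchain where the extension is available:

    #include <sycl/sycl.hpp>

    int main() {
        sycl::queue q;
        float * buf = sycl::malloc_device<float>(1024, q);

        q.fill(buf, 1.0f, 1024);                        // work submitted before the barrier
        sycl::event ev = q.ext_oneapi_submit_barrier(); // "record": captures everything above
        q.fill(buf, 2.0f, 1024);                        // later work, not covered by ev
        ev.wait();                                      // host waits only for the first fill

        sycl::free(buf, q);
        return 0;
    }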

static void ggml_backend_sycl_event_synchronize(ggml_backend_event_t event) {
if (event == nullptr || event->context == nullptr) {
return;
}

sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
}

static ggml_backend_i ggml_backend_sycl_interface = {
/* .get_name = */ ggml_backend_sycl_name,
/* .free = */ ggml_backend_sycl_free,
/* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
/* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
// // TODO: update for the new
// interface
/* .synchronize = */ ggml_backend_sycl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
@@ -5197,11 +5257,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .supports_op = */ ggml_backend_sycl_supports_op,
/* .supports_buft = */ ggml_backend_sycl_supports_buft,
/* .offload_op = */ ggml_backend_sycl_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_new = */ ggml_backend_sycl_event_new,
/* .event_free = */ ggml_backend_sycl_event_free,
/* .event_record = */ ggml_backend_sycl_event_record,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
/* .event_synchronize = */ ggml_backend_sycl_event_synchronize,
};
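
Note that .event_wait is left as NULL, so callers cannot make this backend's queue wait on a recorded event; they still have to synchronize on the host instead. Purely as a hedged sketch (not part of this commit, and assuming the hook's (backend, event) signature), a device-side wait could reuse the same context/stream accessors as the functions above together with the wait-list overload of the submit-barrier extension:

    // Assumed sketch only: makes the backend's queue wait for the recorded
    // event without blocking the host thread.
    static void ggml_backend_sycl_event_wait(ggml_backend_t backend,
                                             ggml_backend_event_t event) try {
        ggml_backend_sycl_context *sycl_ctx =
            (ggml_backend_sycl_context *)backend->context;
        sycl::event *sycl_event = static_cast<sycl::event *>(event->context);

        const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
        stream->ext_oneapi_submit_barrier({*sycl_event});
    } catch (sycl::exception const &exc) {
        std::cerr << exc.what() << "Exception caught at file:" << __FILE__
                  << ", line:" << __LINE__ << std::endl;
        std::exit(1);
    }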

static ggml_guid_t ggml_backend_sycl_guid() {
54 changes: 28 additions & 26 deletions src/llama.cpp
@@ -5046,17 +5046,15 @@ struct llama_model_loader {
}
}
}
#endif

#if defined(GGML_USE_SYCL)
#elif defined(GGML_USE_SYCL)
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<void*> host_ptrs;
// std::vector<ggml_backend_event_t> events;
std::vector<ggml_backend_event_t> events;
size_t buffer_idx = 0; // buffer to use for async loads

ggml_backend_t sycl_backend = nullptr;
@@ -5080,6 +5078,7 @@ struct llama_model_loader {
for (size_t idx = 0; idx < n_buffers; ++idx) {
host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
events.emplace_back(ggml_backend_event_new(sycl_backend));
}
}
}
@@ -5161,24 +5160,29 @@ struct llama_model_loader {

}
else
#endif
#if defined(GGML_USE_SYCL)
// If sycl_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (sycl_backend) {
file->seek(weight->offs, SEEK_SET);

size_t bytes_read = 0;
#elif defined(GGML_USE_SYCL)
// If sycl_backend is valid load the tensor in chunks to
// pinned memory and upload the buffers asynchronously to the
// GPU.
if (sycl_backend) {
file->seek(weight->offs, SEEK_SET);

while (bytes_read < n_size) {
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(sycl_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
bytes_read += read_iteration;
++buffer_idx;
buffer_idx %= n_buffers;
}
size_t bytes_read = 0;

while (bytes_read < n_size) {
size_t read_iteration =
std::min<size_t>(buffer_size, n_size - bytes_read);
ggml_backend_event_synchronize(events[buffer_idx]);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(sycl_backend, cur,
host_ptrs[buffer_idx],
bytes_read, read_iteration);
ggml_backend_event_record(events[buffer_idx]);
bytes_read += read_iteration;
++buffer_idx;
buffer_idx %= n_buffers;
}
else
} else
#endif
{
read_buf.resize(n_size);
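
The reuse guard is why ggml_backend_event_synchronize sits at the top of the loop: a pinned slot may only be overwritten once the upload it last fed has drained, while the async upload of the current chunk overlaps with reading the next chunk from disk. Condensed, the rotation looks like the following (same identifiers as the loader code above; file, cur and n_size stand in for the tensor being loaded):

    size_t bytes_read = 0;
    while (bytes_read < n_size) {
        const size_t chunk = std::min<size_t>(buffer_size, n_size - bytes_read);

        ggml_backend_event_synchronize(events[buffer_idx]); // wait until this slot's previous upload finished
        file->read_raw(host_ptrs[buffer_idx], chunk);       // disk -> pinned host slot
        ggml_backend_tensor_set_async(sycl_backend, cur, host_ptrs[buffer_idx],
                                      bytes_read, chunk);   // pinned host -> device, asynchronous
        ggml_backend_event_record(events[buffer_idx]);      // mark when this slot drains

        bytes_read += chunk;
        buffer_idx  = (buffer_idx + 1) % n_buffers;
    }
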
@@ -5205,15 +5209,13 @@ struct llama_model_loader {
}
ggml_backend_free(cuda_backend);
}
#endif

#if defined(GGML_USE_SYCL)
#elif defined(GGML_USE_SYCL)
// free temporary resources used for async sycl uploads
if (sycl_backend) {
for (size_t idx = 0; idx < n_buffers;++idx) {
// ggml_backend_event_synchronize(events[idx]);
// ggml_backend_event_free(events[idx]);
ggml_backend_buffer_free(host_buffers[idx]);
ggml_backend_event_synchronize(events[idx]);
ggml_backend_event_free(events[idx]);
ggml_backend_buffer_free(host_buffers[idx]);
}
ggml_backend_free(sycl_backend);
}
