Incorporating review changes - added elem count check in kernel…

…, using for call strategy
huggingface · Oct 1, 2024 · bc60875 · bc60875
1 parent 942a617
commit bc60875
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 7 deletions.
diff --git a/candle-metal-kernels/src/fill.metal b/candle-metal-kernels/src/fill.metal
@@ -5,18 +5,23 @@ using namespace metal;
 template<typename T> METAL_FUNC void fill_with(
     device T *out,
     constant float &value,
+    constant size_t &numel,
     uint tid [[thread_position_in_grid]]
 ) {
+    if (tid >= numel) {
+        return;
+    }
     out[tid] = static_cast<T>(value);
 }
 
 #define FILL_OP(NAME, T)                                \
 kernel void fill_##NAME(                                \
     device T *out,                                      \
     constant float &value,                              \
+    constant size_t &numel,                              \
     uint tid [[thread_position_in_grid]]                \
 ) {                                                     \
-    fill_with<T>(out, value, tid);                      \
+    fill_with<T>(out, value, numel, tid);              \
 }                                                       \
 
 

diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs
@@ -2375,14 +2375,12 @@ pub fn call_const_fill(
 
     encoder.set_compute_pipeline_state(&pipeline);
 
-    set_params!(encoder, (output, v));
+    set_params!(encoder, (output, v, length));
+
+    let (thread_group_count, thread_group_size) = linear_split(&pipeline, length);
 
     encoder.use_resource(output, metal::MTLResourceUsage::Write);
-
-    let grid_size = MTLSize { width: length as u64, height: 1, depth: 1 };
-    let thread_group_size = MTLSize { width: pipeline.max_total_threads_per_threadgroup(), height: 1, depth: 1 };
-
-    encoder.dispatch_threads(grid_size, thread_group_size);
+    encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
 
     Ok(())
 }