
Commit

relpath
Signed-off-by: Mayank Mishra <[email protected]>
mayank31398 committed Dec 19, 2024
1 parent 7f468d6 commit d9dc48a
Showing 5 changed files with 11 additions and 11 deletions.
@@ -10,7 +10,7 @@ template <typename scalar_t, typename vector_t>
 __global__ void _add_scalar_forward_cuda_kernel(const scalar_t *x,
                                                 const fp32 y,
                                                 scalar_t *output,
-                                                const int64_t num_elements) {
+                                                const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -100,7 +100,7 @@ void add_scalar_forward_cuda(const torch::Tensor &x,
                              const int &BLOCK_SIZE) {
     assert(BLOCK_SIZE % WARP_SIZE == 0);
 
-    const int64_t num_elements = x.numel();
+    const uint64 num_elements = x.numel();
 
     const int num_elements_per_block = BLOCK_SIZE * vector_instruction_width;
     const int NUM_BLOCKS = (num_elements + num_elements_per_block - 1) / num_elements_per_block;
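Note: this commit changes the element count from a signed int64_t to an unsigned uint64 throughout. The alias itself is defined outside this diff; below is a minimal sketch, assuming uint64 maps to uint64_t, of the alias and of the ceiling division used above to size the grid (the ceil_divide helper name is hypothetical).

#include <cstdint>

// Assumed alias; the real definition lives in a dtypes header that is not part of this diff.
using uint64 = uint64_t;

// Hypothetical helper showing the grid-size arithmetic used above:
// NUM_BLOCKS = ceil(num_elements / num_elements_per_block).
inline uint64 ceil_divide(const uint64 numerator, const uint64 denominator) {
    return (numerator + denominator - 1) / denominator;
}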
@@ -10,7 +10,7 @@ template <typename scalar_t, typename vector_t>
 __global__ void _add_tensor_forward_cuda_kernel(const scalar_t *x,
                                                 const scalar_t *y,
                                                 scalar_t *output,
-                                                const int64_t num_elements) {
+                                                const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -103,7 +103,7 @@ void add_tensor_forward_cuda(const torch::Tensor &x,
                              const int &BLOCK_SIZE) {
     assert(BLOCK_SIZE % WARP_SIZE == 0);
 
-    const int64_t num_elements = x.numel();
+    const uint64 num_elements = x.numel();
 
     const int num_elements_per_block = BLOCK_SIZE * vector_instruction_width;
     const int NUM_BLOCKS = (num_elements + num_elements_per_block - 1) / num_elements_per_block;
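The vector_t template parameter drives vectorized memory access: each thread processes vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t) elements per load. A self-contained sketch of that pattern for an elementwise add, using float4 (width 4); this is an illustration, not the repository's kernel.

__global__ void add_tensor_fp32x4_sketch(const float *x, const float *y, float *output, const uint64_t num_elements) {
    // Each thread owns 4 consecutive fp32 values, loaded and stored as one float4.
    const uint64_t thread_id = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t start = thread_id * 4;

    if (start + 3 < num_elements) {
        const float4 _x = reinterpret_cast<const float4 *>(x)[thread_id];
        const float4 _y = reinterpret_cast<const float4 *>(y)[thread_id];
        float4 _out;
        _out.x = _x.x + _y.x;
        _out.y = _x.y + _y.y;
        _out.z = _x.z + _y.z;
        _out.w = _x.w + _y.w;
        reinterpret_cast<float4 *>(output)[thread_id] = _out;
    } else {
        // Tail: handle the final num_elements % 4 values one at a time.
        for (uint64_t i = start; i < num_elements; i++) {
            output[i] = x[i] + y[i];
        }
    }
}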
@@ -11,11 +11,11 @@ template <typename scalar_t, typename vector_t>
 __global__ void _embedding_forward_cuda_kernel(const scalar_t *input_ids,
                                                const scalar_t *weight,
                                                scalar_t *output,
-                                               const int64_t &num_elements,
+                                               const uint64 &num_elements,
                                                const int &BLOCK_SIZE_B,
                                                const int &BLOCK_SIZE_H) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
-    const int64_t thread_id = get_global_thread_id();
+    const uint64 thread_id = get_global_thread_id();
 
     using dtype = DType<scalar_t>;
 }
@@ -25,7 +25,7 @@ void embedding_forward_cuda(const torch::Tensor &input_ids,
                             torch::Tensor output,
                             const int &BLOCK_SIZE_B,
                             const int &BLOCK_SIZE_H) {
-    const int64_t num_elements = gate.numel();
+    const uint64 num_elements = gate.numel();
 
     AT_DISPATCH_CUSTOM_FLOAT_TYPES(gate.scalar_type(), "embedding_forward_cuda_kernel", ([&] {
                                        const int num_elements_per_block = BLOCK_SIZE * vector_instruction_width;
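The embedding kernel obtains its index from get_global_thread_id(), whose result now feeds a uint64. Its body is not shown in this diff; a sketch of the usual 1-D launch convention, with the _sketch suffix marking it as illustrative:

__device__ inline uint64_t get_global_thread_id_sketch() {
    // Global thread index for a 1-D grid of 1-D blocks.
    return (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
}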
@@ -31,7 +31,7 @@ __global__ void _swiglu_backward_cuda_kernel(const scalar_t *gate,
                                              const scalar_t *output_grad,
                                              scalar_t *gate_grad,
                                              scalar_t *up_grad,
-                                             const int64_t num_elements) {
+                                             const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -167,7 +167,7 @@ void swiglu_backward_cuda(const torch::Tensor &gate,
                           torch::Tensor &up_grad,
                           const int &vector_instruction_width,
                           const int &BLOCK_SIZE) {
-    const int64_t num_elements = gate.numel();
+    const uint64 num_elements = gate.numel();
 
     AT_DISPATCH_CUSTOM_FLOAT_TYPES(
         gate.scalar_type(), "swiglu_backward_cuda_kernel", ([&] {
@@ -11,7 +11,7 @@ template <typename scalar_t, typename vector_t>
 __global__ void _swiglu_forward_cuda_kernel(const scalar_t *gate,
                                             const scalar_t *up,
                                             scalar_t *output,
-                                            const int64_t num_elements) {
+                                            const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -119,7 +119,7 @@ void swiglu_forward_cuda(const torch::Tensor &gate,
                          torch::Tensor &output,
                          const int &vector_instruction_width,
                          const int &BLOCK_SIZE) {
-    const int64_t num_elements = gate.numel();
+    const uint64 num_elements = gate.numel();
 
     AT_DISPATCH_CUSTOM_FLOAT_TYPES(
         gate.scalar_type(), "swiglu_forward_cuda_kernel", ([&] {
