Skip to content

Commit

Permalink
optimize and clean code
Browse files Browse the repository at this point in the history
  • Loading branch information
haricot committed Jan 21, 2025
1 parent 49703f9 commit 070878f
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 4 deletions.
2 changes: 1 addition & 1 deletion candle-kernels/src/fill.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ COPY2D_OP(double, copy2d_f64)
COPY2D_OP(uint8_t, copy2d_u8)
COPY2D_OP(uint32_t, copy2d_u32)
COPY2D_OP(int64_t, copy2d_i64)

#if __CUDA_ARCH__ >= 530
#include <cuda_bf16.h>
extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); }
Expand Down
4 changes: 1 addition & 3 deletions candle-kernels/src/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -579,9 +579,7 @@ ROPE_OP(__nv_bfloat16, rope_bf16, rope_i_bf16, rope_thd_bf16)
FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argmax_bf16, fast_sum_bf16)
#endif

#if __CUDA_ARCH__ >= 800
SUM_OP(__nv_bfloat16, sum_bf16)
#elif __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ >= 750
SUM_OP(__nv_bfloat16, sum_bf16)
#elif __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ < 750
//The automatic fallback mechanism for these architectures:
Expand Down

0 comments on commit 070878f

Please sign in to comment.