
Commit

relpath
Signed-off-by: Mayank Mishra <[email protected]>
mayank31398 committed Dec 19, 2024
1 parent 7f468d6 commit d9dc48a
Showing 5 changed files with 11 additions and 11 deletions.
@@ -10,7 +10,7 @@ template <typename scalar_t, typename vector_t>
 __global__ void _add_scalar_forward_cuda_kernel(const scalar_t *x,
                                                 const fp32 y,
                                                 scalar_t *output,
-                                                const int64_t num_elements) {
+                                                const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -100,7 +100,7 @@ void add_scalar_forward_cuda(const torch::Tensor &x,
                              const int &BLOCK_SIZE) {
     assert(BLOCK_SIZE % WARP_SIZE == 0);
 
-    const int64_t num_elements = x.numel();
+    const uint64 num_elements = x.numel();
 
     const int num_elements_per_block = BLOCK_SIZE * vector_instruction_width;
     const int NUM_BLOCKS = (num_elements + num_elements_per_block - 1) / num_elements_per_block;
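Note: this commit changes the element count from a signed int64_t to an unsigned uint64 throughout. The alias itself is defined outside this diff; below is a minimal sketch, assuming uint64 maps to uint64_t, of the alias and of the ceiling division used above to size the grid (the ceil_divide helper name is hypothetical).

#include <cstdint>

// Assumed alias; the real definition lives in a dtypes header that is not part of this diff.
using uint64 = uint64_t;

// Hypothetical helper showing the grid-size arithmetic used above:
// NUM_BLOCKS = ceil(num_elements / num_elements_per_block).
inline uint64 ceil_divide(const uint64 numerator, const uint64 denominator) {
    return (numerator + denominator - 1) / denominator;
}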
@@ -10,7 +10,7 @@ template <typename scalar_t, typename vector_t>
 __global__ void _add_tensor_forward_cuda_kernel(const scalar_t *x,
                                                 const scalar_t *y,
                                                 scalar_t *output,
-                                                const int64_t num_elements) {
+                                                const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -103,7 +103,7 @@ void add_tensor_forward_cuda(const torch::Tensor &x,
                              const int &BLOCK_SIZE) {
     assert(BLOCK_SIZE % WARP_SIZE == 0);
 
-    const int64_t num_elements = x.numel();
+    const uint64 num_elements = x.numel();
 
     const int num_elements_per_block = BLOCK_SIZE * vector_instruction_width;
     const int NUM_BLOCKS = (num_elements + num_elements_per_block - 1) / num_elements_per_block;
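The vector_t template parameter drives vectorized memory access: each thread processes vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t) elements per load. A self-contained sketch of that pattern for an elementwise add, using float4 (width 4); this is an illustration, not the repository's kernel.

__global__ void add_tensor_fp32x4_sketch(const float *x, const float *y, float *output, const uint64_t num_elements) {
    // Each thread owns 4 consecutive fp32 values, loaded and stored as one float4.
    const uint64_t thread_id = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t start = thread_id * 4;

    if (start + 3 < num_elements) {
        const float4 _x = reinterpret_cast<const float4 *>(x)[thread_id];
        const float4 _y = reinterpret_cast<const float4 *>(y)[thread_id];
        float4 _out;
        _out.x = _x.x + _y.x;
        _out.y = _x.y + _y.y;
        _out.z = _x.z + _y.z;
        _out.w = _x.w + _y.w;
        reinterpret_cast<float4 *>(output)[thread_id] = _out;
    } else {
        // Tail: handle the final num_elements % 4 values one at a time.
        for (uint64_t i = start; i < num_elements; i++) {
            output[i] = x[i] + y[i];
        }
    }
}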
@@ -11,11 +11,11 @@ template <typename scalar_t, typename vector_t>
 __global__ void _embedding_forward_cuda_kernel(const scalar_t *input_ids,
                                                const scalar_t *weight,
                                                scalar_t *output,
-                                               const int64_t &num_elements,
+                                               const uint64 &num_elements,
                                                const int &BLOCK_SIZE_B,
                                                const int &BLOCK_SIZE_H) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
-    const int64_t thread_id = get_global_thread_id();
+    const uint64 thread_id = get_global_thread_id();
 
     using dtype = DType<scalar_t>;
 }
@@ -25,7 +25,7 @@ void embedding_forward_cuda(const torch::Tensor &input_ids,
                             torch::Tensor output,
                             const int &BLOCK_SIZE_B,
                             const int &BLOCK_SIZE_H) {
-    const int64_t num_elements = gate.numel();
+    const uint64 num_elements = gate.numel();
 
     AT_DISPATCH_CUSTOM_FLOAT_TYPES(gate.scalar_type(), "embedding_forward_cuda_kernel", ([&] {
                                        const int num_elements_per_block = BLOCK_SIZE * vector_instruction_width;
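The embedding kernel obtains its index from get_global_thread_id(), whose result now feeds a uint64. Its body is not shown in this diff; a sketch of the usual 1-D launch convention, with the _sketch suffix marking it as illustrative:

__device__ inline uint64_t get_global_thread_id_sketch() {
    // Global thread index for a 1-D grid of 1-D blocks.
    return (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
}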
@@ -31,7 +31,7 @@ __global__ void _swiglu_backward_cuda_kernel(const scalar_t *gate,
                                              const scalar_t *output_grad,
                                              scalar_t *gate_grad,
                                              scalar_t *up_grad,
-                                             const int64_t num_elements) {
+                                             const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -167,7 +167,7 @@ void swiglu_backward_cuda(const torch::Tensor &gate,
                           torch::Tensor &up_grad,
                           const int &vector_instruction_width,
                           const int &BLOCK_SIZE) {
-    const int64_t num_elements = gate.numel();
+    const uint64 num_elements = gate.numel();
 
     AT_DISPATCH_CUSTOM_FLOAT_TYPES(
         gate.scalar_type(), "swiglu_backward_cuda_kernel", ([&] {
@@ -11,7 +11,7 @@ template <typename scalar_t, typename vector_t>
 __global__ void _swiglu_forward_cuda_kernel(const scalar_t *gate,
                                             const scalar_t *up,
                                             scalar_t *output,
-                                            const int64_t num_elements) {
+                                            const uint64 num_elements) {
     constexpr int vector_instruction_width = sizeof(vector_t) / sizeof(scalar_t);
     static_assert(vector_instruction_width == 1 || vector_instruction_width == 2 || vector_instruction_width == 4 ||
                   vector_instruction_width == 8);
@@ -119,7 +119,7 @@ void swiglu_forward_cuda(const torch::Tensor &gate,
                          torch::Tensor &output,
                          const int &vector_instruction_width,
                          const int &BLOCK_SIZE) {
-    const int64_t num_elements = gate.numel();
+    const uint64 num_elements = gate.numel();
 
     AT_DISPATCH_CUSTOM_FLOAT_TYPES(
         gate.scalar_type(), "swiglu_forward_cuda_kernel", ([&] {
