diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 090f95d1bda71..6dae32b25f9c4 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -48,7 +48,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, const scalar_t* __restrict__ input, int64_t num_elems) { __shared__ float cache[1024]; - int i = blockDim.x * blockIdx.x + threadIdx.x; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; // First store maximum for all values processes by // the current thread in cache[threadIdx.x]