diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index a39462334e732..36bbcb76d8846 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -64,7 +64,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, is_row_parallel = input_size != input_size_per_partition # If group_size is -1, we are in channelwise case. - group_size = input_size if self.group_size == -1 else self.group_size + channelwise = (self.group_size == -1) + group_size = input_size if channelwise else self.group_size + row_parallel = (input_size != input_size_per_partition) + # In the case of channelwise quantization, we need to replicate the + # scales across all gpus. + partition_scales = (row_parallel and not channelwise) verify_marlin_supports_shape( output_size_per_partition=output_size_per_partition,