Dispatch cutlass_scaled_mm to scaled_mm_triton on ROCm

Signed-off-by: Randall Smith <[email protected]>
vllm-project · Oct 30, 2024 · 4624680 · 4624680
1 parent c2cd1a2
commit 4624680
Showing 1 changed file with 9 additions and 0 deletions.
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
@@ -1,5 +1,6 @@
 import contextlib
 import functools
+import importlib
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 import torch
@@ -486,6 +487,14 @@ def cutlass_scaled_mm(a: torch.Tensor,
 
     m = a.shape[0]
     n = b.shape[1]
+
+    if current_platform.is_rocm():
+        scaled_mm_triton_module = importlib.import_module(
+            "vllm.model_executor.layers.quantization.compressed_tensors."
+            "scaled_mm_triton")
+        scaled_mm_triton = scaled_mm_triton_module.scaled_mm_triton
+        return scaled_mm_triton(a, b, scale_a, scale_b, out_dtype, bias)
+
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
 
     torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)