Commit f28cd30

feat: AutoDeploy fp8 quantization support for bmm (#3849)
Signed-off-by: Wei-Ming Chen <[email protected]>
1 parent 6e48ac2 commit f28cd30

8 files changed (+493, -37 lines)

tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py

Lines changed: 86 additions & 1 deletion

@@ -1,8 +1,10 @@
 """Definition of the quant module that can be used for PTQ."""
 
+import warnings
 from typing import Optional
 
 import torch
+from flashinfer import bmm_fp8
 from torch import nn
 
 from tensorrt_llm._torch.autotuner import autotune
@@ -222,7 +224,90 @@ def fp4_linear_fake(
     return torch.ops.aten.linear(input, weight_fp4.repeat(1, 2).to(input.dtype), bias)
 
 
-QUANT_OPS = [
+def is_column_major(tensor):
+    rows, _ = tensor.shape[-2:]
+    strides = tensor.stride()
+    return strides[-2] == 1 and strides[-1] == rows
+
+
+@torch.library.custom_op("auto_deploy::torch_quant_fp8_bmm", mutates_args=())
+def fp8_bmm(
+    input: torch.Tensor,
+    mat2: torch.Tensor,
+    input_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+) -> torch.Tensor:
+    """FP8 BMM op similar to torch.bmm.
+
+    Args:
+        input: unquantized input tensor with shape (B, M, K)
+        mat2: weight tensor with shape (B, K, N), with dtype torch.float8_e4m3fn,
+            torch.float16, or torch.bfloat16
+        input_scale: a scalar tensor - the inverse scale for input quantization
+        weight_scale: a scalar tensor - the inverse scale for weight quantization
+
+    Returns:
+        The BMM output with shape (B, M, N) and the same dtype as the input.
+    """
+    # Ensure input is contiguous
+    input = input.contiguous()
+    original_input_dtype = input.dtype
+
+    # Convert input to fp8 using the provided scale
+    if input.dtype in [torch.float16, torch.bfloat16]:
+        input_fp8 = _to_fp8(input, input_scale)
+    else:
+        assert input.dtype == torch.float8_e4m3fn
+        input_fp8 = input
+
+    # Convert mat2 to fp8 using the provided scale
+    if mat2.dtype in [torch.float16, torch.bfloat16]:
+        mat2_fp8 = _to_fp8(mat2, weight_scale)
+    else:
+        assert mat2.dtype == torch.float8_e4m3fn
+        mat2_fp8 = mat2
+
+    # Ensure mat2 is contiguous in column-major format only if needed.
+    # Check if the tensor is already contiguous when transposed (i.e., already column-major).
+    if not is_column_major(mat2_fp8):
+        warnings.warn(
+            "mat2 is not in column-major format; transposing it will cause performance degradation."
+        )
+        mat2_fp8 = mat2_fp8.transpose(-2, -1).contiguous().transpose(-2, -1)
+
+    # Get dimensions
+    B, M, K = input.shape
+    B2, K2, N = mat2_fp8.shape
+    assert B == B2, f"Batch dimensions must match: {B} vs {B2}"
+    assert K == K2, f"Inner dimensions must match: {K} vs {K2}"
+
+    output = torch.empty((B, M, N), dtype=original_input_dtype, device=input.device)
+    bmm_fp8(
+        input_fp8, mat2_fp8, input_scale.float(), weight_scale.float(), original_input_dtype, output
+    )
+
+    return output
+
+
+@fp8_bmm.register_fake
+def fp8_bmm_fake(
+    input: torch.Tensor,
+    mat2: torch.Tensor,
+    input_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+) -> torch.Tensor:
+    """Fake implementation of fp8_bmm for testing and tracing."""
+    # Use standard bmm
+    return torch.bmm(input.to(torch.float), mat2.to(torch.float)).to(input.dtype)
+
+
+QUANT_LINEAR_OPS = [
     torch.ops.auto_deploy.torch_quant_fp8_linear,
     torch.ops.auto_deploy.torch_quant_fp4_linear,
 ]
+
+QUANT_BMM_OPS = [
+    torch.ops.auto_deploy.torch_quant_fp8_bmm,
+]
+
+QUANT_OPS = QUANT_LINEAR_OPS + QUANT_BMM_OPS
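The new op can be exercised directly once the module above has been imported (registration happens as a side effect of the import). Below is a minimal sketch, assuming flashinfer and an FP8-capable GPU are available; the amax/448 per-tensor scale shown here is only an assumption for illustration, since the actual scale convention comes from _to_fp8 and the checkpoint, neither of which is part of this diff.

import torch
import tensorrt_llm._torch.auto_deploy.custom_ops.quant  # noqa: F401  (registers the custom op; needs flashinfer)

# Illustrative shapes only.
B, M, K, N = 4, 32, 64, 128
x = torch.randn(B, M, K, dtype=torch.bfloat16, device="cuda")
w = torch.randn(B, K, N, dtype=torch.bfloat16, device="cuda")

# Assumed per-tensor scale convention (amax / 448 for e4m3); the real
# convention is set by _to_fp8 and the checkpoint, not shown in this diff.
x_scale = (x.abs().amax() / 448.0).float()
w_scale = (w.abs().amax() / 448.0).float()

# Eager execution goes through flashinfer's bmm_fp8; under tracing the
# registered fake falls back to a plain float32 torch.bmm.
out = torch.ops.auto_deploy.torch_quant_fp8_bmm(x, w, x_scale, w_scale)
ref = torch.bmm(x.float(), w.float()).to(x.dtype)
print(out.shape, (out.float() - ref.float()).abs().max())

Because w here is row-major, the op will emit the column-major warning and re-lay out the weight internally, exactly as the is_column_major branch above describes.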

tensorrt_llm/_torch/auto_deploy/transformations/library/fusion.py

Lines changed: 13 additions & 7 deletions

@@ -41,7 +41,7 @@ def _insert_fused_gemm(gm: GraphModule, idx: int, parent_node: Node, linear_node
     sizes_unfused = [p.size(0) for p in params_unfused]
     key_fused = f"fused_weight_{idx}"
 
-    quantization_impl = QuantizationImpl.create(linear_nodes[0])
+    quantization_impls = [QuantizationImpl.create(n) for n in linear_nodes]
 
     def fuse_weights(tensors: List[torch.Tensor]) -> torch.Tensor:
         """Fuse weights of linear nodes."""
@@ -51,17 +51,20 @@ def split_output(tensor: torch.Tensor) -> Tuple[torch.Tensor, ...]:
         """Split the output tensor of the fused linear node to obtain the original outputs."""
         return tuple(t.contiguous() for t in torch.split(tensor, sizes_unfused, dim=-1))
 
-    if quantization_impl:
+    if all(
+        q is not None and quantization_impls[0].target_op() == q.target_op()
+        for q in quantization_impls
+    ):
         scales = {}
         for weight_key in keys_unfused:
             key = weight_key.rsplit(".", 1)[0]
 
-            for scale_name in quantization_impl.scale_names():
+            for scale_name in quantization_impls[0].scale_names():
                 buffer_name = key + "." + scale_name
                 scales.setdefault(scale_name, []).append(gm.get_buffer(buffer_name))
 
         try:
-            weights_fused, buffer_fused = quantization_impl.fuse_linear_weights(
+            weights_fused, buffer_fused = quantization_impls[0].fuse_linear_weights(
                 params_unfused, **scales
             )
         except NotImplementedError as e:
@@ -73,8 +76,11 @@ def split_output(tensor: torch.Tensor) -> Tuple[torch.Tensor, ...]:
             fused_buffer_name = key_fused + "_" + scale_name
             gm.register_buffer(fused_buffer_name, buffer)
 
-    else:
+    elif all(q is None for q in quantization_impls):
         param_fused = nn.Parameter(fuse_weights([gm.get_parameter(k) for k in keys_unfused]))
+    else:
+        ad_logger.warning(f"Cannot fuse ops {keys_unfused} for mixed-precision linear nodes.")
+        return
 
     setattr(gm, key_fused, param_fused)
 
@@ -83,8 +89,8 @@ def split_output(tensor: torch.Tensor) -> Tuple[torch.Tensor, ...]:
 
     with gm.graph.inserting_before(linear_nodes[0]):
         get_param_node = gm.graph.get_attr(key_fused, torch.Tensor)
-        if quantization_impl:
-            for scale_name in quantization_impl.scale_names():
+        if quantization_impls[0]:
+            for scale_name in quantization_impls[0].scale_names():
                # Creates new nodes for the fused scales so the unfused linear ops can be fully erased.
                fused_kwargs[scale_name] = gm.graph.create_node(
                    "get_attr", key_fused + "_" + scale_name

tensorrt_llm/_torch/auto_deploy/transformations/library/quantization.py

Lines changed: 116 additions & 12 deletions

@@ -9,6 +9,7 @@
 from ...utils.node_utils import (
     extract_param_names_from_lin_node,
     get_quantization_params_from_linear_node,
+    is_bmm_op,
     is_linear_op,
     is_match,
 )
@@ -81,8 +82,95 @@ def _insert_quantized_linear(
     node.kwargs = {**node.kwargs, **scales}
 
 
+def _insert_quantized_bmm(
+    gm: GraphModule,
+    node: Node,
+    quantization_impl: QuantizationImpl,
+    is_quantized_graph: bool = False,
+):
+    """Replaces the bmm node with a new quantized bmm node."""
+    weight_node = node.args[1]
+
+    # Weight is a parameter
+    if weight_node.op == "get_attr":
+        # Handle parameter tensor
+        param_name = weight_node.target
+        original_weight = gm.get_parameter(param_name)
+        weight_shape = original_weight.shape
+
+        # Quantize the weight
+        new_param = nn.Parameter(
+            quantization_impl.quantize_weight(original_weight), requires_grad=False
+        )
+
+        # Update the parameter in the model
+        modname, _, attrname = param_name.rpartition(".")
+        submod = gm.get_submodule(modname)
+        setattr(submod, attrname, new_param)
+
+        # Register load state dict hook
+        gm._register_load_state_dict_pre_hook(
+            partial(quantization_impl.load_hook, weight_name=param_name)
+        )
+        if quantization_impl.post_load_hook:
+            gm.register_load_state_dict_post_hook(
+                partial(quantization_impl.post_load_hook, weight_name=param_name)
+            )
+
+        # Set up scale names and target module for the parameter case
+        def get_scale_name(scale_name):
+            return attrname + "_" + scale_name
+
+        scale_target_module = submod
+        scale_name_prefix = f"{modname}."
+
+    # Weight is a dynamic tensor
+    elif hasattr(weight_node, "meta") and "val" in weight_node.meta:
+        weight_shape = weight_node.meta["val"].shape
+
+        # Create a unique identifier for this dynamic weight node
+        node_id = f"bmm_dynamic_{id(node)}"
+
+        # Set up scale names and target module for the dynamic case
+        def get_scale_name(scale_name):
+            return f"{node_id}_{scale_name}"
+
+        scale_target_module = gm  # Register in the root module
+        scale_name_prefix = ""
+
+        ad_logger.info(f"Quantized BMM with dynamic weight tensor for node {node}")
+    else:
+        # If we can't determine the shape, skip quantization
+        ad_logger.warning(
+            f"BMM weight is a dynamic tensor without shape metadata, skipping quantization for node {node}"
+        )
+        return
+
+    # Common logic for both the parameter and dynamic tensor cases:
+    # register scales in the target module
+    for scale_name, scale in quantization_impl.default_scales(weight_shape).items():
+        scale_buffer_name = get_scale_name(scale_name)
+        scale_target_module.register_buffer(scale_buffer_name, scale)
+
+    # Change the node target to the quantized bmm op
+    node.target = quantization_impl.target_op()
+
+    # Insert scale nodes
+    with gm.graph.inserting_before(node):
+        scales = {}
+        for scale_name in quantization_impl.scale_names():
+            scale_buffer_name = get_scale_name(scale_name)
+            scales[scale_name] = gm.graph.create_node(
+                "get_attr", f"{scale_name_prefix}{scale_buffer_name}"
+            )
+
+    # Update node arguments and kwargs
+    scale_values = [scales[scale_name] for scale_name in quantization_impl.scale_names()]
+    node.args = (*node.args, *scale_values)
+
+
 def quantize(gm: GraphModule, quant_config: Dict[str, Any]):
-    """Quantize the GraphModule and replace linear with quantized linear."""
+    """Quantize the GraphModule and replace linear and bmm with quantized versions."""
     # extract info from quant_config
     is_quant_graph = is_quantized_graph(gm)
     quant_algo = quant_config.get("quant_algo")
@@ -93,28 +181,44 @@ def quantize(gm: GraphModule, quant_config: Dict[str, Any]):
         ad_logger.info("No quantization to do.")
         return gm
 
-    # tracking quantized linears in the graph
-    quantized_nodes: Dict[str, int] = defaultdict(lambda: 0)
+    # tracking quantized operations in the graph
+    quantized_nodes: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
     for n in gm.graph.nodes:
         # check if we should skip this node
-        if is_match(n, skip) or not is_linear_op(n, include_quantization=False):
+        if is_match(n, skip):
             continue
 
-        # get per-layer quantization format from the node
-        quant_algo_n: str = get_quantization_from_linear_node(n) if is_quant_graph else quant_algo
-        if not quant_algo_n:
-            continue
+        # Process linear operations
+        if is_linear_op(n, include_quantization=False):
+            # get per-layer quantization format from the node
+            quant_algo_n: str = (
+                get_quantization_from_linear_node(n) if is_quant_graph else quant_algo
+            )
+            if not quant_algo_n:
+                continue
+
+            # insert quantized linear node
+            _insert_quantized_linear(gm, n, QuantizationImpl.create(quant_algo_n), is_quant_graph)
+            quantized_nodes[quant_algo_n]["linear"] += 1
 
-        # insert quantized linear node
-        _insert_quantized_linear(gm, n, QuantizationImpl.create(quant_algo_n), is_quant_graph)
-        quantized_nodes[quant_algo_n] += 1
+        # Process BMM operations
+        elif is_bmm_op(n):
+            if not quant_algo:
+                continue
+
+            # insert quantized bmm node
+            _insert_quantized_bmm(
+                gm, n, QuantizationImpl.create(quant_algo, is_bmm=True), is_quant_graph
+            )
+            quantized_nodes[quant_algo]["bmm"] += 1
 
     if is_quant_graph:
         remove_output_quantizers(gm)
 
     gm = canonicalize_graph(gm)
     for quant_algo in quantized_nodes:
-        ad_logger.info(f"Found {quantized_nodes[quant_algo]} {quant_algo} quantized nodes.")
+        for op_type, count in quantized_nodes[quant_algo].items():
+            ad_logger.info(f"Found {count} {quant_algo} quantized {op_type} nodes.")
     ad_logger.debug("After quantization: " + str(gm))
 
     return gm
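To make the graph surgery concrete, the following self-contained torch.fx sketch goes through the same mechanics that _insert_quantized_bmm relies on: locate a bmm call, register per-tensor scale buffers on the owning module, pull them into the graph as get_attr nodes, and append them to the call's arguments. The buffer names and the fake_quant_bmm stand-in are invented for illustration; the real pass retargets the node to the quantized custom op returned by quantization_impl.target_op().

import torch
import torch.fx as fx

class TinyBmm(torch.nn.Module):
    def forward(self, x, w):
        return torch.bmm(x, w)

gm = fx.symbolic_trace(TinyBmm())

# Locate the bmm call, as the quantize() loop does via is_bmm_op.
bmm_node = next(
    n for n in gm.graph.nodes if n.op == "call_function" and n.target is torch.bmm
)

# Register per-tensor scale buffers on the root module; the names are
# invented here (the real values come from QuantizationImpl.default_scales()).
gm.register_buffer("bmm_input_scale", torch.tensor(1.0))
gm.register_buffer("bmm_weight_scale", torch.tensor(1.0))

# Pull the scales into the graph as get_attr nodes and append them to the
# call's arguments, mirroring the tail of _insert_quantized_bmm.
with gm.graph.inserting_before(bmm_node):
    in_scale = gm.graph.get_attr("bmm_input_scale")
    w_scale = gm.graph.get_attr("bmm_weight_scale")
bmm_node.args = (*bmm_node.args, in_scale, w_scale)

# The real pass would retarget to the quantized custom op; this stand-in has
# the same arity so the toy graph still runs without flashinfer.
def fake_quant_bmm(a, b, a_scale, b_scale):
    return torch.bmm(a, b)  # scales ignored in this illustration

bmm_node.target = fake_quant_bmm
gm.recompile()

x, w = torch.randn(2, 3, 4), torch.randn(2, 4, 5)
print(gm(x, w).shape)  # torch.Size([2, 3, 5])
print(gm.graph)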

tensorrt_llm/_torch/auto_deploy/utils/node_utils.py

Lines changed: 12 additions & 2 deletions

@@ -8,7 +8,7 @@
 from torch._ops import OpOverload, OpOverloadPacket
 from torch.fx import Graph, GraphModule, Node
 
-from ..custom_ops.quant import QUANT_OPS
+from ..custom_ops.quant import QUANT_BMM_OPS, QUANT_LINEAR_OPS
 from .logger import ad_logger
 
 try:
@@ -226,10 +226,20 @@ def is_linear_op(node: Node, include_quantization: bool = False) -> bool:
     }
 
     if include_quantization:
-        lin_ops.update(QUANT_OPS)
+        lin_ops.update(QUANT_LINEAR_OPS)
     return is_op(node, lin_ops)
 
 
+def is_bmm_op(node: Node, include_quantization: bool = False) -> bool:
+    """Check if the node is a batched matmul (bmm) op."""
+    bmm_ops = {torch.ops.aten.bmm}
+
+    if include_quantization:
+        bmm_ops.update(QUANT_BMM_OPS)
+
+    return is_op(node, bmm_ops)
+
+
 def is_dist_op(node: Node) -> bool:
     """Check if the node is a distributed op."""
     dist_ops = {
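As a side note, is_bmm_op follows the same is_op matching pattern as is_linear_op and is_dist_op. The snippet below is a local stand-in, not the library helper, showing how such a check picks bmm calls out of a traced graph; the real helper additionally folds in QUANT_BMM_OPS when include_quantization=True.

import torch
import torch.fx as fx

def _is_bmm_node(node: fx.Node) -> bool:
    """Local stand-in for is_bmm_op(); checks the call target against common bmm variants."""
    bmm_targets = {torch.bmm, torch.ops.aten.bmm, torch.ops.aten.bmm.default}
    return node.op == "call_function" and node.target in bmm_targets

class Net(torch.nn.Module):
    def forward(self, x, w):
        return torch.bmm(x, w) + 1

gm = fx.symbolic_trace(Net())
print([n.format_node() for n in gm.graph.nodes if _is_bmm_node(n)])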
