 from ..custom_ops.attention_interface import CacheConfig
 from ..utils._config import deep_merge_dicts
 from ..utils.logger import ad_logger
-from .factory import ModelFactory, ModelFactoryRegistry
+from .factory import ModelFactory, ModelFactoryRegistry, ShardingConfigSource
 from .quant_config_reader import QuantConfigReader, QuantConfigReaderRegistry


@@ -94,6 +94,9 @@ def __init__(self, *args, **kwargs):
         assert isinstance(dtype, torch.dtype), f"Invalid dtype: {dtype}"
         self.model_kwargs["torch_dtype"] = dtype

+        # set sharding config source to huggingface
+        self._sharding_config["source"] = ShardingConfigSource.HUGGINGFACE
+
     @property
     def autoconfig_from_pretrained(self):
         return AutoConfig.from_pretrained
@@ -161,13 +164,30 @@ def _build_model(self, device: DeviceLikeType) -> nn.Module:
         if hasattr(model, "post_init"):
             model.post_init()

+        # if present, initialize sharding config. We need head_dim for colwise sharding.
+        self._set_sharding_config(model.config)
+
         # patch forward method
         model.forward = types.MethodType(self._simple_forward, model)

         model.eval()

         return model

+    def _set_sharding_config(self, model_config: PretrainedConfig):
+        """Set the sharding config for the model."""
+        self._sharding_config["head_dim"] = 1
+        if hasattr(model_config, "base_model_tp_plan"):
+            self._sharding_config["tp_plan"] = model_config.base_model_tp_plan
+        if hasattr(model_config, "head_dim") and model_config.head_dim is not None:
+            self._sharding_config["head_dim"] = model_config.head_dim
+        elif hasattr(model_config, "hidden_size") and hasattr(model_config, "num_attention_heads"):
+            self._sharding_config["head_dim"] = (
+                model_config.hidden_size // model_config.num_attention_heads
+            )
+        if hasattr(model_config, "num_hidden_layers"):
+            self._sharding_config["num_hidden_layers"] = model_config.num_hidden_layers
+
     def get_quant_config(self) -> Dict:
         """Returns the quantization config for this model or an empty dict if not quantized."""
         if self._quant_config_reader is not None:
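
Note: the head_dim resolution added above follows the common Hugging Face convention: prefer an explicit head_dim on the config, otherwise derive it as hidden_size // num_attention_heads, and keep the default of 1 if neither is available. A minimal standalone sketch of that fallback order, using a hypothetical resolve_head_dim helper and a SimpleNamespace in place of a real PretrainedConfig:

    from types import SimpleNamespace

    def resolve_head_dim(cfg, default: int = 1) -> int:
        """Hypothetical helper mirroring the fallback order used in the diff above."""
        # 1) an explicit head_dim wins if it is set and not None
        if getattr(cfg, "head_dim", None) is not None:
            return cfg.head_dim
        # 2) otherwise derive it from hidden_size / num_attention_heads
        if hasattr(cfg, "hidden_size") and hasattr(cfg, "num_attention_heads"):
            return cfg.hidden_size // cfg.num_attention_heads
        # 3) fall back to the default (1 in the diff above)
        return default

    # e.g. a Llama-style config without an explicit head_dim
    cfg = SimpleNamespace(hidden_size=4096, num_attention_heads=32)
    assert resolve_head_dim(cfg) == 128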
@@ -339,6 +359,19 @@ class AutoModelForImageTextToTextFactory(AutoModelForCausalLMFactory):
            },
        }

+    def _set_sharding_config(self, model_config: PretrainedConfig):
+        """Override the sharding config for the model with text_config."""
+        super()._set_sharding_config(model_config)
+
+        if hasattr(model_config, "text_config"):
+            text_config = model_config.text_config
+            if hasattr(text_config, "base_model_tp_plan"):
+                self._sharding_config["tp_plan"] = text_config.base_model_tp_plan
+            if hasattr(text_config, "head_dim"):
+                self._sharding_config["head_dim"] = text_config.head_dim
+            if hasattr(text_config, "num_hidden_layers"):
+                self._sharding_config["num_hidden_layers"] = text_config.num_hidden_layers
+
     @property
     def automodel_from_config(self):
         return AutoModelForImageTextToText.from_config
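
Note: for image-text-to-text models the attention geometry relevant to sharding lives on the nested text_config, so the override re-reads tp_plan, head_dim, and num_hidden_layers from there when present. A rough sketch of that precedence, again with SimpleNamespace objects standing in for real Hugging Face configs and a hypothetical sharding_values helper:

    from types import SimpleNamespace

    def sharding_values(cfg) -> dict:
        """Hypothetical illustration: nested text_config values win over top-level ones."""
        values = {
            "head_dim": getattr(cfg, "head_dim", 1),
            "num_hidden_layers": getattr(cfg, "num_hidden_layers", None),
        }
        text_cfg = getattr(cfg, "text_config", None)
        if text_cfg is not None:
            # the language-model sub-config overrides the top-level values
            for key in ("head_dim", "num_hidden_layers"):
                if hasattr(text_cfg, key):
                    values[key] = getattr(text_cfg, key)
        return values

    vlm_cfg = SimpleNamespace(
        num_hidden_layers=2,  # top-level value (may describe the vision side)
        text_config=SimpleNamespace(head_dim=128, num_hidden_layers=40),
    )
    assert sharding_values(vlm_cfg) == {"head_dim": 128, "num_hidden_layers": 40}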