MoE Marlin: support desc_act for groupsize != -1 (#2590)
This change uses the updated Marlin MoE kernel from vLLM to support
MoE with activation sorting and groups.
danieldk authored Sep 30, 2024
1 parent d1f257a commit 1c84a30
Showing 5 changed files with 6 additions and 19 deletions.
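
For context: GPTQ's activation reordering (desc_act) permutes weight columns, so with a finite groupsize the per-column group indices are no longer contiguous and a kernel must gather scales through an explicit g_idx map instead of striding. A rough standalone illustration of that effect (none of these names come from this repository):

import torch

groupsize = 128
in_features = 512

# Without desc_act, column i reads its scales from group i // groupsize,
# so group membership is a simple stride.
g_idx = torch.arange(in_features) // groupsize

# With desc_act the columns are permuted (a random permutation here stands
# in for the activation-magnitude ordering), so group membership is
# scattered and the kernel must follow g_idx explicitly.
perm = torch.randperm(in_features)
g_idx_desc_act = g_idx[perm]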
7 changes: 4 additions & 3 deletions flake.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion flake.nix
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:danieldk/tgi-nix";
+    tgi-nix.url = "github:danieldk/tgi-nix/moe-kernels-0.5.0";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
3 changes: 1 addition & 2 deletions server/text_generation_server/layers/marlin/gptq.py
@@ -109,7 +109,6 @@ def get_weights_col_packed(
         prefix: str,
         block_sizes: Union[int, List[int]],
     ):
-
         try:
             qweight = weights.get_packed_sharded(
                 f"{prefix}.qweight", dim=1, block_sizes=block_sizes
@@ -352,7 +351,7 @@ def repack_gptq_for_marlin(

     scales = permute_scales(scales)

-    is_full_k = not (desc_act and sharded_infeatures)
+    is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures)

     return GPTQMarlinWeight(
         qweight=repacked,
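
The condition change above narrows when full-K status is lost: a sharded, act-ordered weight now only gives up full-K when it actually has groups. A minimal before/after sketch (hypothetical standalone functions, not the repository's code):

def is_full_k_before(desc_act: bool, groupsize: int, sharded_infeatures: bool) -> bool:
    # Old rule: any act-ordered weight with sharded in-features was
    # treated as partial-K, even channelwise (groupsize == -1) weights.
    return not (desc_act and sharded_infeatures)

def is_full_k_after(desc_act: bool, groupsize: int, sharded_infeatures: bool) -> bool:
    # New rule: channelwise weights keep full-K, since without groups
    # there is no scale reordering for desc_act to break across shards.
    return not (desc_act and groupsize != -1 and sharded_infeatures)

# The only case that changes: desc_act with channelwise quantization.
assert is_full_k_before(True, -1, True) is False
assert is_full_k_after(True, -1, True) is True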
3 changes: 0 additions & 3 deletions server/text_generation_server/layers/moe/__init__.py
@@ -249,12 +249,9 @@ def is_supported(weights: Weights) -> bool:
         or (
             isinstance(weights.loader, GPTQMarlinWeightsLoader)
             and can_use_marlin_moe_gemm(
-                desc_act=weights.loader.desc_act,
-                groupsize=weights.loader.groupsize,
                 quant_method=weights.loader.quant_method,
                 quantize=weights.loader.quantize,
                 sym=weights.loader.sym,
-                use_tp=weights.process_group.size() > 1,
             )
         )
     )
10 changes: 0 additions & 10 deletions server/text_generation_server/layers/moe/gptq_marlin.py
@@ -26,12 +26,9 @@

 def can_use_marlin_moe_gemm(
     *,
-    desc_act: bool,
-    groupsize: int,
     quant_method: str,
     quantize: str,
     sym: bool,
-    use_tp: bool,
 ):
     return (
         SYSTEM == "cuda"
@@ -40,16 +37,9 @@ def can_use_marlin_moe_gemm(
         and quantize == "gptq"
         and quant_method == "gptq"
         and sym
-        and is_full_k(desc_act, groupsize, use_tp)
     )


-def is_full_k(desc_act: bool, groupsize: int, use_tp: bool):
-    if groupsize == -1:
-        return True
-    return not (desc_act and use_tp)
-
-
 @dataclass
 class GPTQMarlinMoEWeight:
     qweight: torch.Tensor
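
Net effect of the two hunks above: the gate becomes a pure quantization-scheme check, since the updated vLLM Marlin MoE kernel handles act-ordered grouped weights itself. A condensed sketch of the resulting predicate (the SYSTEM constant, the elided capability checks, and the imports are assumed from the surrounding file):

def can_use_marlin_moe_gemm(*, quant_method: str, quantize: str, sym: bool):
    return (
        SYSTEM == "cuda"
        # ...hardware capability checks elided (hidden between the hunks)...
        and quantize == "gptq"
        and quant_method == "gptq"
        and sym
        # The is_full_k(desc_act, groupsize, use_tp) gate is gone: the new
        # kernel supports desc_act together with groupsize != -1 directly.
    )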
