Commit 4db6271

Fix Mixtral conversion (#365)
1 parent ca3b000 commit 4db6271

2 files changed: +25 -8 lines changed


fast_llm/models/gpt/conversion/llama.py

Lines changed: 8 additions & 4 deletions
@@ -354,6 +354,10 @@ class LlamaBlockConverter:
     mixer_converter_class: typing.ClassVar[type[LlamaAttentionConverter]] = LlamaAttentionConverter
     mlp_converter_class: typing.ClassVar[type[LlamaMLPConverter]] = LlamaMLPConverter
     normalization_converter_class: typing.ClassVar[type[LlamaNormalizationConverter]] = LlamaNormalizationConverter
+    hf_mixer_name: typing.ClassVar[str] = "self_attn"
+    hf_mlp_name: typing.ClassVar[str] = "mlp"
+    hf_norm_1_name: typing.ClassVar[str] = "input_layernorm"
+    hf_norm_2_name: typing.ClassVar[str] = "post_attention_layernorm"

     @classmethod
     def import_config(cls, config: dict, hidden_size: int) -> dict:
@@ -380,25 +384,25 @@ def get_converters(
             *cls.mixer_converter_class.get_converters(
                 config.mixer,
                 f"{fast_llm_prefix}.mixer",
-                f"{hf_prefix}.self_attn",
+                f"{hf_prefix}.{cls.hf_mixer_name}",
                 drop_on_export,
             ),
             *cls.mlp_converter_class.get_converters(
                 config.mlp,
                 f"{fast_llm_prefix}.mlp",
-                f"{hf_prefix}.mlp",
+                f"{hf_prefix}.{cls.hf_mlp_name}",
                 drop_on_export,
             ),
             *cls.normalization_converter_class.get_converters(
                 config.normalization,
                 f"{fast_llm_prefix}.norm_1",
-                f"{hf_prefix}.input_layernorm",
+                f"{hf_prefix}.{cls.hf_norm_1_name}",
                 drop_on_export,
             ),
             *cls.normalization_converter_class.get_converters(
                 config.normalization,
                 f"{fast_llm_prefix}.norm_2",
-                f"{hf_prefix}.post_attention_layernorm",
+                f"{hf_prefix}.{cls.hf_norm_2_name}",
                 drop_on_export,
             ),
         ]
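Taken together, the llama.py hunks lift the HuggingFace submodule names that were hard-coded in LlamaBlockConverter.get_converters into class-level attributes (hf_mixer_name, hf_mlp_name, hf_norm_1_name, hf_norm_2_name), so a subclass can retarget a single submodule by overriding one string instead of re-implementing get_converters. Below is a minimal, self-contained sketch of that override pattern; the class names and the get_hf_prefixes helper are illustrative stand-ins, not the actual Fast-LLM API.

import typing


class BlockConverterSketch:
    # Default HF submodule names, matching Llama-style checkpoints.
    hf_mixer_name: typing.ClassVar[str] = "self_attn"
    hf_mlp_name: typing.ClassVar[str] = "mlp"

    @classmethod
    def get_hf_prefixes(cls, hf_prefix: str) -> dict[str, str]:
        # Build HF weight prefixes from the class-level names instead of
        # hard-coded strings, so subclasses only override data, not logic.
        return {
            "mixer": f"{hf_prefix}.{cls.hf_mixer_name}",
            "mlp": f"{hf_prefix}.{cls.hf_mlp_name}",
        }


class MixtralBlockConverterSketch(BlockConverterSketch):
    # Mixtral stores its MoE MLP under "block_sparse_moe" in HF checkpoints.
    hf_mlp_name: typing.ClassVar[str] = "block_sparse_moe"


print(BlockConverterSketch.get_hf_prefixes("model.layers.0"))
# {'mixer': 'model.layers.0.self_attn', 'mlp': 'model.layers.0.mlp'}
print(MixtralBlockConverterSketch.get_hf_prefixes("model.layers.0"))
# {'mixer': 'model.layers.0.self_attn', 'mlp': 'model.layers.0.block_sparse_moe'}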

fast_llm/models/gpt/conversion/mixtral.py

Lines changed: 17 additions & 4 deletions
@@ -4,7 +4,7 @@
 from fast_llm.engine.checkpoint.external import SplitWeightConverter, WeightConverter
 from fast_llm.layers.decoder.mlp.config import MoEMLPConfig
 from fast_llm.models.gpt.conversion.config import MixtralCheckpointFormat
-from fast_llm.models.gpt.conversion.llama import LlamaMLPConverter, get_weight_and_bias_converters
+from fast_llm.models.gpt.conversion.llama import LlamaMLPConverter, MLPLayer2Converter, get_weight_and_bias_converters
 from fast_llm.models.gpt.conversion.mistral import (
     MistralBaseModelConverter,
     MistralBlockConverter,
@@ -50,16 +50,29 @@ def get_converters(
         return [
             *get_weight_and_bias_converters(
                 f"{fast_llm_prefix}.router",
-                () if drop_on_export else (f"{hf_prefix}.router",),
-                config.add_linear_biases,
+                f"{hf_prefix}.gate",
+                False,
+                drop_on_export=drop_on_export,
+            ),
+            *get_weight_and_bias_converters(
+                f"{fast_llm_prefix}.layer_1",
+                tuple(f"{hf_prefix}.experts.{i}.{w}" for i in range(config.experts) for w in ("w1", "w3")),
+                False,
                 SplitWeightConverter,
                 drop_on_export=drop_on_export,
             ),
-            *super().get_converters(config, fast_llm_prefix, hf_prefix, drop_on_export=drop_on_export),
+            *get_weight_and_bias_converters(
+                f"{fast_llm_prefix}.layer_2",
+                tuple(f"{hf_prefix}.experts.{i}.w2" for i in range(config.experts)),
+                False,
+                MLPLayer2Converter,
+                drop_on_export=drop_on_export,
+            ),
         ]


 class MixtralBlockConverter(MistralBlockConverter):
+    hf_mlp_name: typing.ClassVar[str] = "block_sparse_moe"
     mlp_converter_class: typing.ClassVar[type[MixtralMLPConverter]] = MixtralMLPConverter
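For context, HF Mixtral checkpoints keep the MoE block under block_sparse_moe, with a gate router and per-expert projections named w1 (gate), w3 (up), and w2 (down), while Fast-LLM keeps the experts fused in layer_1 and layer_2; the rewritten MixtralMLPConverter.get_converters now builds that mapping directly instead of delegating to super(). The sketch below only enumerates the HF-side weight names the new converters target for one block; the helper function is hypothetical, and the real wiring is done by get_weight_and_bias_converters together with SplitWeightConverter and MLPLayer2Converter.

def mixtral_hf_expert_weights(hf_prefix: str, num_experts: int) -> dict[str, tuple[str, ...]]:
    # Hypothetical helper: list the HF weight names behind each fused Fast-LLM tensor.
    return {
        # Router: a single linear layer named "gate" in HF Mixtral checkpoints.
        "router": (f"{hf_prefix}.gate",),
        # Fast-LLM's fused layer_1 splits into each expert's w1 and w3 projections.
        "layer_1": tuple(
            f"{hf_prefix}.experts.{i}.{w}" for i in range(num_experts) for w in ("w1", "w3")
        ),
        # Fast-LLM's layer_2 maps to each expert's w2 (down) projection.
        "layer_2": tuple(f"{hf_prefix}.experts.{i}.w2" for i in range(num_experts)),
    }


if __name__ == "__main__":
    names = mixtral_hf_expert_weights("model.layers.0.block_sparse_moe", num_experts=2)
    for fast_llm_name, hf_names in names.items():
        print(fast_llm_name, "->", hf_names)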
