moe #162

Merged
26 commits merged on Sep 5, 2024

Changes from 14 commits

Commits (26)
e036804
Add Mixtral LLM
archana-ramalingam May 21, 2024
3110119
Refactoring attention, moe and ffn blocks
archana-ramalingam May 22, 2024
d1691c3
Allow _optional_int_prop to handle missing hyperparameters
archana-ramalingam May 22, 2024
a865ac3
Fixing circular dep and imports
archana-ramalingam May 23, 2024
3496258
Fix multiple expert layer weight handling + other issues
archana-ramalingam May 29, 2024
2b32fba
Add ffn_moe layers and other fixes
archana-ramalingam Jun 13, 2024
15f2a22
Edit theta slicing
archana-ramalingam Jun 13, 2024
0f155c5
Fix ffn_moe theta parsing & wraping
archana-ramalingam Jun 14, 2024
4a8bb97
Extract tensor unmerging into a function
archana-ramalingam Jun 14, 2024
36eb868
Cleaning up debug statements
archana-ramalingam Aug 19, 2024
58890f9
Fix test failure
archana-ramalingam Aug 19, 2024
99186fd
Add rope_freq_base to llama
archana-ramalingam Aug 19, 2024
c66cbe5
Rebase and fixes
IanNod Aug 28, 2024
0bc76f6
Add missing grok layers
archana-ramalingam Aug 29, 2024
96de75d
adds a test for exporting moe block
dan-garvey Sep 3, 2024
f323792
actually add the test
dan-garvey Sep 3, 2024
2cd365b
some fixes
dan-garvey Sep 5, 2024
67f112f
moe moe moe
dan-garvey Sep 5, 2024
12b2a7a
refactor paged llama
dan-garvey Sep 5, 2024
77163aa
fix format
dan-garvey Sep 5, 2024
5fba3de
rope_freq
dan-garvey Sep 5, 2024
b315fa3
saver
dan-garvey Sep 5, 2024
a2df6a4
address rope freq
dan-garvey Sep 5, 2024
47b14b6
fix llama attn
dan-garvey Sep 5, 2024
911b3a3
Merge branch 'main' into moe-wip
dan-garvey Sep 5, 2024
6a28481
add tensor name
dan-garvey Sep 5, 2024
6 changes: 3 additions & 3 deletions sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -16,7 +16,7 @@

# TODO: Should be using a base class with the protocol supported.
from ..models.llama.llama import LlamaModelConfig, PagedLlamaModelV1

from ..models.mixtral.mixtral import *

def main():
from ..utils import cli
@@ -52,8 +52,8 @@ def main():
llama_config = LlamaModelConfig(hp)
llama_config.static_tables = False # Rely on the compiler for hoisting tables.
llama_config.kv_cache_type = "direct" if args.bs == [1] else "paged"
model = PagedLlamaModelV1(dataset.root_theta, llama_config)

#model = PagedLlamaModelV1(dataset.root_theta, llama_config)
model = PagedMixtralModelV1(dataset.root_theta, llama_config)
def generate_params_json(hp, prefill_bs: list[int], decode_bs: list[int]):
return {
"module_name": "module",
11 changes: 6 additions & 5 deletions sharktank/sharktank/examples/paged_llm_v1.py
@@ -17,6 +17,7 @@
from ..types import *

# TODO: Should be using a base class with the protocol supported.
from ..models.mixtral.mixtral import *
from ..models.llama.llama import *
from ..utils.debugging import trace_tensor
from ..utils.tokenizer import InferenceTokenizer, load_tokenizer
@@ -236,12 +237,12 @@ def main():
activation_dtype=activation_dtype,
attention_dtype=activation_dtype,
)
model = PagedLlamaModelV1(dataset.root_theta, config)
if args.save_intermediates_path:
from ..utils.patching import SaveModuleResultTensorsPatch

intermediates_saver = SaveModuleResultTensorsPatch()
intermediates_saver.patch_child_modules(model)
if config.hp.expert_count:
model = PagedMixtralModelV1(dataset.root_theta, config)
else:
model = PagedLlamaModelV1(dataset.root_theta, config)

generator = TorchGenerator(model, tokenizer)

print(f":: Prompting:")
161 changes: 161 additions & 0 deletions sharktank/sharktank/examples/validate_direct_mixtral_model.py
@@ -0,0 +1,161 @@
# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import sys

import torch

from sharktank.layers import *
from sharktank.types import *
from sharktank.models.mixtral.mixtral import *


def main(args: list[str]):
from ..utils import cli

torch.no_grad().__enter__()

parser = cli.create_parser()
cli.add_input_dataset_options(parser)
args = cli.parse(parser)

dataset = cli.get_input_dataset(args)
hp = configs.LlamaHParams.from_gguf_props(dataset.properties)
llama_config = LlamaModelConfig(hp)
llama_config.kv_cache_type = "direct"
llama_config.activation_dtype = torch.float16
model = PagedMixtralModelV1(dataset.root_theta, llama_config)

# bs ("batch size") == 1
cache_state = model.cache.allocate(bs=1)

start_index = 0
tokens = torch.tensor(
[
[
1,
1059,
31871,
1217,
322,
266,
3682,
6075,
31902,
13,
31849,
31871,
0,
0,
0,
0,
]
+ 48 * [0],
]
)
assert tokens.shape[1] % model.cache.block_seq_stride == 0
seq_block_ids = torch.tensor(
[
[127, 0, 0, 0],
]
)

# Important: Do not use a sequence length of 0 for empty batch slots
# as it will cause softmax to nan due to a mask of all -inf. This then
# propagates and causes badness.
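# For example (illustrative, not part of the original script): a mask row that is
# entirely -inf, as in torch.softmax(torch.full((1, 4), float("-inf")), dim=-1),
# comes out as all NaN, since every exp(-inf) is 0 and the normalization divides 0 by 0.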
seq_lens = torch.tensor([12])

attention_mask = model.attention_mask(
model.input_mask(seq_lens, tokens.shape[1]),
)

print(f"Step {start_index}")
logits = model.prefill(
tokens,
attention_mask=attention_mask,
seq_block_ids=seq_block_ids,
cache_state=cache_state,
)
# TODO: Normalize the output of extract_tokens_from_logits into tensor [bs, 1].
tokens = torch.tensor(model.extract_tokens_from_logits(logits, seq_lens)).unsqueeze(
1
)
print(f" : tokens = {tokens}")
# TODO(scotttodd): flatten then print? or index into full tensor?
# print(f" : cache[127] = {cache_state[0][127]}")
# print(f" : cache[126] = {cache_state[0][126]}")
# print(f" : cache[0] = {cache_state[0][0]}")
# print(f" : cache[1] = {cache_state[0][1]}")

# Decode a step.
print("Decoding...")
print(tokens.shape, tokens)
start_positions = torch.tensor([12])
seq_lens = seq_lens + 1
decode_attention_mask = model.decode_attention_mask(
model.input_mask(
seq_lens,
seq_block_ids.shape[1] * model.cache.block_seq_stride,
),
)
logits = model.decode(
tokens,
attention_mask=decode_attention_mask,
start_positions=start_positions,
seq_block_ids=seq_block_ids,
cache_state=cache_state,
)
tokens = torch.tensor(model.extract_tokens_from_logits(logits, [1])).unsqueeze(1)
print(f" : tokens = {tokens}")
# print(f" : cache[127] = {cache_state[0][127]}")
# print(f" : cache[126] = {cache_state[0][126]}")
# print(f" : cache[0] = {cache_state[0][0]}")
# print(f" : cache[1] = {cache_state[0][1]}")

# from sharktank.models import llama
# print(f"+++PREFILL XK = {llama.DEBUG_PREFILL_XK.shape}\n{llama.DEBUG_PREFILL_XK}")
# print(f"+++DECODE XK = {llama.DEBUG_DECODE_XK.shape}\n{llama.DEBUG_DECODE_XK}")
# torch.testing.assert_close(llama.DEBUG_PREFILL_XK, llama.DEBUG_DECODE_XK)

def save_prefill_module(model):
from iree.compiler.extras.fx_importer import FxImporter
from iree.compiler.ir import AsmState

importer = FxImporter()
# asm_state = AsmState(importer.module_op)

print("Generating FX graph")

class InferenceModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.add_module("prefill", model)

def forward(self, tokens, attention_mask, seq_block_ids, *cache_state):
return self.prefill.prefill(
tokens,
attention_mask=attention_mask,
seq_block_ids=seq_block_ids,
cache_state=list(cache_state),
)

infmod = InferenceModule()
prog = torch.export.export(
infmod, (tokens, attention_mask, seq_block_ids) + tuple(cache_state)
)

print(f"FX prog:", prog)
importer.import_program(prog, func_name="prefill")
output_file = "/tmp/prefill.mlirbc"
print("Saving to:", output_file)
with open(output_file, "wb") as f:
importer.module_op.write_bytecode(f)

# save_prefill_module()


if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
48 changes: 48 additions & 0 deletions sharktank/sharktank/examples/validate_mixtral_ref_model.py
@@ -0,0 +1,48 @@
# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import sys

import torch

from sharktank.layers import *
from sharktank.types import *
from sharktank.models.mixtral.mixtral_ref import *


def main(args: list[str]):
from ..utils import cli

torch.no_grad().__enter__()

parser = cli.create_parser()
cli.add_input_dataset_options(parser)
args = cli.parse(parser)

dataset = cli.get_input_dataset(args)
hp = configs.LlamaHParams.from_gguf_props(dataset.properties)
ref_llama_config = RefLlamaModelConfig(hp)
ref_llama_config.activation_dtype = torch.float16
model = DirectCacheMixtralModelV1(dataset.root_theta, ref_llama_config)

kv_cache = model.create_cache(bs=1)
start_index = 0
next_tokens = [1, 1059, 31871, 1217, 322, 266, 3682, 6075, 31902, 13, 31849, 31871]
print(f"Step {start_index}")
tokens = model.forward(
torch.tensor([next_tokens]), start_index=start_index, local_kv_cache=kv_cache
)
print(f" : tokens = {tokens}")

# Decode a step.
print("Decoding...")
print(tokens.shape, tokens)
decode_token = model.forward(tokens, start_index=12, local_kv_cache=kv_cache)
print(f" : decode tokens = {decode_token}")


if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
5 changes: 5 additions & 0 deletions sharktank/sharktank/layers/__init__.py
@@ -12,5 +12,10 @@
from .norm import RMSNormLayer
from .rotary_embedding import RotaryEmbeddingLayer
from .token_embedding import TokenEmbeddingLayer
from .llama_attention_block import LlamaAttentionBlock
from .paged_llama_attention_block import PagedLlamaAttentionBlock
from .ffn_block import FFN
from .ffn_moe_block import FFNMOE
from .mixture_of_experts_block import SparseMoeBlock

from . import configs
5 changes: 1 addition & 4 deletions sharktank/sharktank/layers/base.py
@@ -16,11 +16,8 @@
from ..utils import debugging

__all__ = [
"LinearLayer",
"RotaryEmbeddingLayer",
"RMSNormLayer",
"BaseLayer",
"ThetaLayer",
"TokenEmbedding",
]


50 changes: 35 additions & 15 deletions sharktank/sharktank/layers/configs/llm_configs.py
@@ -19,9 +19,7 @@

import torch

__all__ = [
"LlamaHParams",
]
__all__ = ["LlamaHParams"]


@dataclass
@@ -36,27 +34,43 @@ class LlamaHParams:
block_count: int
feed_forward_length: int
rope_dimension_count: int
rope_freq_base: float
attention_head_count: int
attn_head_dim: int
attention_layer_norm_rms_epsilon: float
attention_head_count_kv: int
expert_count: int
expert_used_count: int

@staticmethod
def from_gguf_props(p: dict[str, Any]):
attention_head_count = _int_prop(p, "llama.attention.head_count")
default_expert_count = 0
default_expert_used_count = 0
default_rope_freq_base = 10000.0
attention_head_count = _int_prop(p, "grok.attention.head_count")

return LlamaHParams(
context_length=_int_prop(p, "llama.context_length"),
embedding_length=_int_prop(p, "llama.embedding_length"),
block_count=_int_prop(p, "llama.block_count"),
feed_forward_length=_int_prop(p, "llama.feed_forward_length"),
attn_head_dim=_int_prop(p, "llama.rope.dimension_count"),
rope_dimension_count=_int_prop(p, "llama.rope.dimension_count"),
context_length=_int_prop(p, "grok.context_length"),
embedding_length=_int_prop(p, "grok.embedding_length"),
block_count=_int_prop(p, "grok.block_count"),
feed_forward_length=_int_prop(p, "grok.feed_forward_length"),
attn_head_dim=128,#_int_prop(p, "grok.rope.dimension_count"),
rope_dimension_count=128,#_int_prop(p, "grok.rope.dimension_count"),
attention_head_count=attention_head_count,
attention_layer_norm_rms_epsilon=_float_prop(
p, "llama.attention.layer_norm_rms_epsilon"
p, "grok.attention.layer_norm_rms_epsilon"
),
attention_head_count_kv=_optional_int_prop(
p, "llama.attention.head_count_kv", attention_head_count
p, "grok.attention.head_count_kv", attention_head_count
),
rope_freq_base=_optional_float_prop(
p, "grok.rope.freq_base", default_rope_freq_base
),
expert_count=_optional_int_prop(
p, "grok.expert_count", default_expert_count
),
expert_used_count=_optional_int_prop(
p, "grok.expert_used_count", default_expert_used_count
),
)

@@ -79,10 +93,16 @@ def _int_prop(p: dict[str, Any], name: str) -> int:
raise KeyError(f"Property '{name}' not found (among keys {p.keys()})")


def _optional_float_prop(p: dict[str, Any], name: str, default_value: float) -> float:
value = p.get(name, default_value)
try:
return float(value)
except ValueError as e:
raise ValueError(f"Property '{name}' expected to be a float and was not") from e


def _optional_int_prop(p: dict[str, Any], name: str, default_value: int) -> int:
value = p[name]
if value is None:
return default_value
value = p.get(name, default_value)
try:
return int(value)
except ValueError as e:
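For reference, a minimal sketch of how the optional-property helpers above behave, assuming a plain dict of GGUF properties (the dict contents below are hypothetical, not taken from a real model file):

props = {"grok.expert_count": 8}
_optional_int_prop(props, "grok.expert_count", 0)             # key present -> 8
_optional_int_prop(props, "grok.expert_used_count", 0)        # key missing -> default 0
_optional_float_prop(props, "grok.rope.freq_base", 10000.0)   # key missing -> default 10000.0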