predibase · arnavgarg1 · Jul 3, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/docs/models/base_models.md b/docs/models/base_models.md
@@ -8,6 +8,7 @@
   - [Zephyr](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
 - 🔄 [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
 - 💎 [Gemma](https://blog.google/technology/developers/gemma-open-models/)
+  - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
 - 🏛️ [Phi-3](https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/) / [Phi-2](https://huggingface.co/microsoft/phi-2)
 - 🔮 [Qwen2 / Qwen](https://huggingface.co/Qwen)
 - 🗣️ [Command-R](https://docs.cohere.com/docs/command-r)

diff --git a/server/lorax_server/cli.py b/server/lorax_server/cli.py
@@ -79,7 +79,7 @@ def serve(
     dtype = None if dtype is None else dtype.value
     if dtype is not None and quantize is not None:
         raise RuntimeError(
-            "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
+            "Only 1 can be set between `dtype` and `quantize`, as they both decide how the final model is initialized."
         )
     server.serve(
         model_id,

diff --git a/server/lorax_server/models/__init__.py b/server/lorax_server/models/__init__.py
@@ -15,6 +15,7 @@
 from lorax_server.models.seq2seq_lm import Seq2SeqLM
 from lorax_server.models.t5 import T5Sharded
 from lorax_server.utils.sources import get_s3_model_local_dir
+from lorax_server.utils.torch_utils import is_bf16_supported
 
 # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
 # in PyTorch 1.12 and later.
@@ -72,15 +73,21 @@ def get_model(
         raise ValueError(f"Unknown source {source}")
 
     model_type = config_dict["model_type"]
+    dtype = dtype or config_dict.get("torch_dtype", "float16")
 
-    if dtype is None:
-        dtype = torch.float16
-    elif dtype == "float16":
+    if dtype in {"float16", "float32"}:
         dtype = torch.float16
     elif dtype == "bfloat16":
-        dtype = torch.bfloat16
+        if is_bf16_supported():
+            dtype = torch.bfloat16
+        else:
+            logger.warning("bfloat16 is not supported on this device, falling back to float16")
+            dtype = torch.float16
     else:
-        raise RuntimeError(f"Unknown dtype {dtype}")
+        try:
+            dtype = getattr(torch, dtype)
+        except AttributeError:
+            raise RuntimeError(f"Unknown dtype {dtype}")
 
     if "facebook/galactica" in model_id:
         return GalacticaSharded(

diff --git a/server/lorax_server/utils/torch_utils.py b/server/lorax_server/utils/torch_utils.py
@@ -0,0 +1,10 @@
+import torch
+
+
+def is_bf16_supported() -> bool:
+    """Check whether CUDA is available and the current GPU supports bfloat16.
+
+    Returns:
+        True if supported, False otherwise.
+    """
+    return torch.cuda.is_available() and torch.cuda.is_bf16_supported()