Added Gemma #267

Merged · 7 commits · Feb 21, 2024
22 changes: 16 additions & 6 deletions docs/models/adapters.md
@@ -36,13 +36,15 @@ Any combination of linear layers can be targeted in the adapters, which correspo
 - `o_proj`
 - `lm_head`
 
-### Qwen
+### Gemma
 
-- `c_attn`
-- `c_proj`
-- `w1`
-- `w2`
-- `lm_head`
+- `q_proj`
+- `k_proj`
+- `v_proj`
+- `o_proj`
+- `gate_proj`
+- `up_proj`
+- `down_proj`
 
 ### Phi
 
@@ -54,6 +56,14 @@ Any combination of linear layers can be targeted in the adapters, which correspo
 - `fc2`
 - `lm_head`
 
+### Qwen
+
+- `c_attn`
+- `c_proj`
+- `w1`
+- `w2`
+- `lm_head`
+
 ### GPT2
 
 - `c_attn`
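For anyone wiring an adapter against the new target modules, here is a hedged sketch (not part of this PR) of a Gemma LoRA configuration using the `peft` library. The module names come straight from the list above; the rank and alpha values are placeholder assumptions.

```python
# Hedged sketch: a LoRA config targeting the Gemma modules documented above.
# peft's LoraConfig is assumed as the training-side tooling; r and lora_alpha
# are placeholder hyperparameters, not values prescribed by this PR.
from peft import LoraConfig

gemma_lora_config = LoraConfig(
    r=8,            # placeholder LoRA rank
    lora_alpha=16,  # placeholder scaling factor
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
    task_type="CAUSAL_LM",
)
```

Any subset of the listed modules is valid; targeting only the attention projections is a common lighter-weight choice.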
3 changes: 2 additions & 1 deletion docs/models/base_models.md
@@ -7,8 +7,9 @@
 - 🌬️[Mistral](https://huggingface.co/mistralai)
 - [Zephyr](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
 - 🔄 [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
-- 🔮 [Qwen](https://huggingface.co/Qwen)
+- 💎 [Gemma](https://blog.google/technology/developers/gemma-open-models/)
 - 🏛️ [Phi](https://huggingface.co/microsoft/phi-2)
+- 🔮 [Qwen](https://huggingface.co/Qwen)
 - 🤖 [GPT2](https://huggingface.co/gpt2)
 - 🌸 [Bloom](https://huggingface.co/bigscience/bloom)
 
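As a usage illustration, prompting a deployment that serves one of these base models might look like the sketch below. Assumptions: a LoRAX server already running locally, the `lorax-client` package's `Client.generate` API, and a hypothetical adapter id.

```python
# Hedged sketch: querying a LoRAX deployment serving a Gemma base model.
# The endpoint URL and adapter id below are placeholders/assumptions.
from lorax import Client

client = Client("http://127.0.0.1:8080")
response = client.generate(
    "Why is the sky blue?",
    adapter_id="my-org/my-gemma-lora",  # hypothetical LoRA adapter id
    max_new_tokens=64,
)
print(response.generated_text)
```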
16 changes: 16 additions & 0 deletions server/lorax_server/models/__init__.py
@@ -50,6 +50,7 @@
 from lorax_server.models.flash_rw import FlashRWSharded
 from lorax_server.models.flash_neox import FlashNeoXSharded
 from lorax_server.models.flash_llama import FlashLlama
+from lorax_server.models.flash_gemma import FlashGemma
 from lorax_server.models.flash_gpt2 import FlashGPT2
 from lorax_server.models.flash_qwen import FlashQwen
 from lorax_server.models.flash_phi import FlashPhi
@@ -66,6 +67,7 @@
 __all__.append(FlashRWSharded)
 __all__.append(FlashSantacoderSharded)
 __all__.append(FlashLlama)
+__all__.append(FlashGemma)
 __all__.append(FlashGPT2)
 __all__.append(FlashQwen)
 __all__.append(FlashPhi)
@@ -361,6 +363,20 @@ def get_model(
             trust_remote_code=trust_remote_code,
         )
         raise NotImplementedError("Phi model requires flash attention v2")
 
+    if model_type == "gemma":
+        if FLASH_ATTENTION:
+            return FlashGemma(
+                model_id,
+                adapter_id,
+                adapter_source,
+                revision,
+                quantize=quantize,
+                compile=compile,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+        raise NotImplementedError("Gemma model requires flash attention v2")
+
     if model_type == "opt":
         return OPTSharded(
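For context on the dispatch above: `model_type` is read from the checkpoint's `config.json`, so the new branch fires for any Gemma checkpoint. A quick way to confirm what a given model id reports, using `transformers` (`google/gemma-2b` is just an assumed example id):

```python
# Confirms the model_type string that get_model dispatches on; AutoConfig
# reads config.json from the checkpoint. "google/gemma-2b" is an example id.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("google/gemma-2b")
print(config.model_type)  # expected: "gemma", matching the new branch
```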