Commit 9649fad

add fp8, update FA
flozi00 committed May 22, 2024
1 parent f207d40 commit 9649fad
Showing 3 changed files with 7 additions and 2 deletions.
4 changes: 4 additions & 0 deletions launcher/src/main.rs
@@ -30,6 +30,7 @@ enum Quantization {
     Hqq_4bit,
     Hqq_3bit,
     Hqq_2bit,
+    Fp8,
 }

 impl std::fmt::Display for Quantization {
@@ -63,6 +64,9 @@ impl std::fmt::Display for Quantization {
             Quantization::Hqq_2bit => {
                 write!(f, "hqq-2bit")
             }
+            Quantization::Fp8 => {
+                write!(f, "fp8")
+            }
         }
     }
 }
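Note: the Display impl is what turns the new variant into the literal string "fp8" that leaves the launcher. A minimal sketch of the round trip on the Python side, assuming the launcher forwards that string to the server's quantize option, as the matching enum added in server/lorax_server/cli.py below suggests (members abridged; plain str-Enum value lookup, not code from this commit):

from enum import Enum

class Quantization(str, Enum):
    hqq_2bit = "hqq-2bit"
    fp8 = "fp8"

# The Rust side writes "fp8"; a value lookup on the str-Enum recovers the member.
assert Quantization("fp8") is Quantization.fp8
print(Quantization.fp8.value)  # -> fp8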
4 changes: 2 additions & 2 deletions server/Makefile-flash-att-v2
@@ -1,11 +1,11 @@
-flash_att_v2_commit_cuda := 23e8fa5a263d1c7122bc46a86ef32030ee7130f9
+flash_att_v2_commit_cuda := v2.5.8
 flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6


 flash-attention-v2-cuda:
 	# Clone flash attention
 	pip install -U packaging ninja --no-cache-dir
-	git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2
+	git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2

 build-flash-attention-v2-cuda: flash-attention-v2-cuda
 	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
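Note: the CUDA checkout now tracks the upstream Dao-AILab repository at the v2.5.8 tag instead of a raw commit hash. A quick sanity check that the pinned build is what actually got installed, assuming the wheel exposes flash_attn.__version__ as recent releases do:

import flash_attn

# The Makefile checks out the v2.5.8 tag, so the installed package
# is expected to report that version.
print(flash_attn.__version__)  # e.g. "2.5.8"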
1 change: 1 addition & 0 deletions server/lorax_server/cli.py
@@ -22,6 +22,7 @@ class Quantization(str, Enum):
     hqq_4bit = "hqq-4bit"
     hqq_3bit = "hqq-3bit"
     hqq_2bit = "hqq-2bit"
+    fp8 = "fp8"


 class Dtype(str, Enum):
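Note: this commit only registers the fp8 option on the launcher and server CLI surfaces; the server-side kernels are not part of the diff. For orientation, a minimal sketch of what per-tensor fp8 weight quantization generally looks like in PyTorch (requires a build with torch.float8_e4m3fn, PyTorch 2.1+; illustrative only, not the lorax implementation):

import torch

def fp8_quantize(weight: torch.Tensor):
    """Illustrative per-tensor cast to float8_e4m3fn with a dequantization scale."""
    finfo = torch.finfo(torch.float8_e4m3fn)
    # Map the largest magnitude onto the e4m3 representable range.
    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
    qweight = (weight * scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    # Return the inverse scale: dequantization is qweight.float() * inv_scale.
    return qweight, scale.reciprocal()

w = torch.randn(16, 16)
qw, inv_scale = fp8_quantize(w)
print(qw.dtype, inv_scale.item())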
